uu_sort/
sort.rs

1// This file is part of the uutils coreutils package.
2//
3// For the full copyright and license information, please view the LICENSE
4// file that was distributed with this source code.
5
6// Although these links don't always seem to describe reality, check out the POSIX and GNU specs:
7// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
8// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
9
10// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit
11
12mod check;
13mod chunks;
14mod custom_str_cmp;
15mod ext_sort;
16mod merge;
17mod numeric_str_cmp;
18mod tmp_dir;
19
20use bigdecimal::BigDecimal;
21use chunks::LineData;
22use clap::builder::ValueParser;
23use clap::{Arg, ArgAction, Command};
24use custom_str_cmp::custom_str_cmp;
25use ext_sort::ext_sort;
26use fnv::FnvHasher;
27#[cfg(target_os = "linux")]
28use nix::libc::{RLIMIT_NOFILE, getrlimit, rlimit};
29use numeric_str_cmp::{NumInfo, NumInfoParseSettings, human_numeric_str_cmp, numeric_str_cmp};
30use rand::{Rng, rng};
31use rayon::prelude::*;
32use std::cmp::Ordering;
33use std::env;
34use std::ffi::{OsStr, OsString};
35use std::fs::{File, OpenOptions};
36use std::hash::{Hash, Hasher};
37use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout};
38use std::num::IntErrorKind;
39use std::ops::Range;
40use std::path::Path;
41use std::path::PathBuf;
42use std::str::Utf8Error;
43use thiserror::Error;
44use uucore::display::Quotable;
45use uucore::error::{FromIo, strip_errno};
46use uucore::error::{UError, UResult, USimpleError, UUsageError};
47use uucore::extendedbigdecimal::ExtendedBigDecimal;
48use uucore::format_usage;
49use uucore::line_ending::LineEnding;
50use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
51use uucore::parser::parse_size::{ParseSizeError, Parser};
52use uucore::parser::shortcut_value_parser::ShortcutValueParser;
53use uucore::show_error;
54use uucore::translate;
55use uucore::version_cmp::version_cmp;
56
57use crate::tmp_dir::TmpDirWrapper;
58
/// String identifiers for the options of this utility, grouped by topic.
///
/// NOTE(review): presumably these are the clap argument ids / long option names —
/// confirm against the argument definitions elsewhere in this file.
mod options {
    /// Identifiers for the options that select how keys are compared.
    pub mod modes {
        pub const SORT: &str = "sort";

        pub const HUMAN_NUMERIC: &str = "human-numeric-sort";
        pub const MONTH: &str = "month-sort";
        pub const NUMERIC: &str = "numeric-sort";
        pub const GENERAL_NUMERIC: &str = "general-numeric-sort";
        pub const VERSION: &str = "version-sort";
        pub const RANDOM: &str = "random-sort";

        /// Every sort-mode identifier above except `SORT` itself.
        pub const ALL_SORT_MODES: [&str; 6] = [
            GENERAL_NUMERIC,
            HUMAN_NUMERIC,
            MONTH,
            NUMERIC,
            VERSION,
            RANDOM,
        ];
    }

    /// Identifiers for the options and option values related to checking.
    pub mod check {
        pub const CHECK: &str = "check";
        pub const CHECK_SILENT: &str = "check-silent";
        pub const SILENT: &str = "silent";
        pub const QUIET: &str = "quiet";
        pub const DIAGNOSE_FIRST: &str = "diagnose-first";
    }

    pub const HELP: &str = "help";
    pub const VERSION: &str = "version";
    pub const DICTIONARY_ORDER: &str = "dictionary-order";
    pub const MERGE: &str = "merge";
    pub const DEBUG: &str = "debug";
    pub const IGNORE_CASE: &str = "ignore-case";
    pub const IGNORE_LEADING_BLANKS: &str = "ignore-leading-blanks";
    pub const IGNORE_NONPRINTING: &str = "ignore-nonprinting";
    pub const OUTPUT: &str = "output";
    pub const REVERSE: &str = "reverse";
    pub const STABLE: &str = "stable";
    pub const UNIQUE: &str = "unique";
    pub const KEY: &str = "key";
    pub const SEPARATOR: &str = "field-separator";
    pub const ZERO_TERMINATED: &str = "zero-terminated";
    pub const PARALLEL: &str = "parallel";
    pub const FILES0_FROM: &str = "files0-from";
    pub const BUF_SIZE: &str = "buffer-size";
    pub const TMP_DIR: &str = "temporary-directory";
    pub const COMPRESS_PROG: &str = "compress-program";
    pub const BATCH_SIZE: &str = "batch-size";

    // Identifier for the positional file operands (name only; usage is outside this block).
    pub const FILES: &str = "files";
}
112
// The byte treated as the decimal point.
const DECIMAL_PT: u8 = b'.';

// The bytes of the minus and plus signs.
const NEGATIVE: &u8 = &b'-';
const POSITIVE: &u8 = &b'+';

// Default value of `GlobalSettings::buffer_size`.
// Choosing a higher buffer size does not result in performance improvements
// (at least not on the original author's machine). TODO: In the future, we should also take
// the amount of available memory into consideration, instead of relying on this constant only.
const DEFAULT_BUF_SIZE: usize = 1_000_000_000; // 1 GB
122
/// Errors reported by this utility.
///
/// Each variant's `Display` text is produced by the `#[error(...)]` attribute attached to it,
/// most of them through a `translate!` message id.
#[derive(Debug, Error)]
pub enum SortError {
    /// A line was found out of order. The message is built by `format_disorder` and is
    /// empty when `silent` is set.
    #[error("{}", format_disorder(.file, .line_number, .line, .silent))]
    Disorder {
        file: OsString,
        line_number: usize,
        line: String,
        silent: bool,
    },

    /// Opening `path` failed with `error`.
    #[error("{}", translate!("sort-open-failed", "path" => format!("{}", .path.maybe_quote()), "error" => strip_errno(.error)))]
    OpenFailed {
        path: PathBuf,
        error: std::io::Error,
    },

    /// The key specification `key` could not be parsed; `msg` carries the reason.
    #[error("{}", translate!("sort-parse-key-error", "key" => .key.quote(), "msg" => .msg.clone()))]
    ParseKeyError { key: String, msg: String },

    /// Reading from `path` failed with `error`.
    #[error("{}", translate!("sort-cannot-read", "path" => format!("{}", .path.maybe_quote()), "error" => strip_errno(.error)))]
    ReadFailed {
        path: PathBuf,
        error: std::io::Error,
    },

    /// Opening a temporary file failed with `error`.
    #[error("{}", translate!("sort-open-tmp-file-failed", "error" => strip_errno(.error)))]
    OpenTmpFileFailed { error: std::io::Error },

    /// The compress program reported the status `code`.
    #[error("{}", translate!("sort-compress-prog-execution-failed", "code" => .code))]
    CompressProgExecutionFailed { code: i32 },

    /// The compress program `prog` terminated abnormally.
    #[error("{}", translate!("sort-compress-prog-terminated-abnormally", "prog" => .prog.quote()))]
    CompressProgTerminatedAbnormally { prog: String },

    /// A temporary file could not be created at `path`.
    #[error("{}", translate!("sort-cannot-create-tmp-file", "path" => format!("{}", .path.display())))]
    TmpFileCreationFailed { path: PathBuf },

    /// File operands were combined with another input source; `file` is reported to the user.
    #[error("{}", translate!("sort-file-operands-combined", "file" => format!("{}", .file.display()), "help" => uucore::execution_phrase()))]
    FileOperandsCombined { file: PathBuf },

    /// Wraps a UTF-8 decoding error.
    // NOTE(review): the variant name is a misspelling of "Utf8Error"; renaming it would break
    // users of this public enum, so it is left as is.
    #[error("{error}")]
    Uft8Error { error: Utf8Error },

    /// More than one output file was given.
    #[error("{}", translate!("sort-multiple-output-files"))]
    MultipleOutputFiles,

    /// Reported with the `sort-minus-in-stdin` message.
    #[error("{}", translate!("sort-minus-in-stdin"))]
    MinusInStdIn,

    /// No input could be obtained from `file`.
    #[error("{}", translate!("sort-no-input-from", "file" => format!("{}", .file.display())))]
    EmptyInputFile { file: PathBuf },

    /// `file` contained a zero-length file name at `line_num`.
    #[error("{}", translate!("sort-invalid-zero-length-filename", "file" => format!("{}", .file.display()), "line_num" => .line_num))]
    ZeroLengthFileName { file: PathBuf, line_num: usize },
}
178
179impl UError for SortError {
180    fn code(&self) -> i32 {
181        match self {
182            Self::Disorder { .. } => 1,
183            _ => 2,
184        }
185    }
186}
187
188fn format_disorder(file: &OsString, line_number: &usize, line: &String, silent: &bool) -> String {
189    if *silent {
190        String::new()
191    } else {
192        translate!("sort-error-disorder", "file" => file.maybe_quote(), "line_number" => line_number, "line" => line.to_owned())
193    }
194}
195
/// The comparison strategy used for a key (or for the whole line).
///
/// The derived ordering follows the declaration order of the variants, so the
/// variants must not be reordered.
#[derive(Eq, Ord, PartialEq, PartialOrd, Clone, Copy, Debug)]
enum SortMode {
    Numeric,
    HumanNumeric,
    GeneralNumeric,
    Month,
    Version,
    Random,
    Default,
}

impl SortMode {
    /// Returns the single-letter option that selects this mode, or `None` for
    /// [`SortMode::Default`], which has no letter.
    fn get_short_name(&self) -> Option<char> {
        let letter = match self {
            Self::Default => return None,
            Self::Numeric => 'n',
            Self::HumanNumeric => 'h',
            Self::GeneralNumeric => 'g',
            Self::Month => 'M',
            Self::Version => 'V',
            Self::Random => 'R',
        };
        Some(letter)
    }
}
220
221pub struct Output {
222    file: Option<(OsString, File)>,
223}
224
225impl Output {
226    fn new(name: Option<impl AsRef<OsStr>>) -> UResult<Self> {
227        let file = if let Some(name) = name {
228            let path = Path::new(name.as_ref());
229            // This is different from `File::create()` because we don't truncate the output yet.
230            // This allows using the output file as an input file.
231            #[allow(clippy::suspicious_open_options)]
232            let file = OpenOptions::new()
233                .write(true)
234                .create(true)
235                .open(path)
236                .map_err(|e| SortError::OpenFailed {
237                    path: path.to_owned(),
238                    error: e,
239                })?;
240            Some((name.as_ref().to_owned(), file))
241        } else {
242            None
243        };
244        Ok(Self { file })
245    }
246
247    fn into_write(self) -> BufWriter<Box<dyn Write>> {
248        BufWriter::new(match self.file {
249            Some((_name, file)) => {
250                // truncate the file
251                let _ = file.set_len(0);
252                Box::new(file)
253            }
254            None => Box::new(stdout()),
255        })
256    }
257
258    fn as_output_name(&self) -> Option<&OsStr> {
259        match &self.file {
260            Some((name, _file)) => Some(name.as_os_str()),
261            None => None,
262        }
263    }
264}
265
/// Settings that apply to a whole invocation.
///
/// `precomputed` is derived from `selectors` by `GlobalSettings::init_precomputed`, which
/// must be called before sorting starts.
#[derive(Clone)]
pub struct GlobalSettings {
    mode: SortMode,
    debug: bool,
    ignore_leading_blanks: bool,
    ignore_case: bool,
    dictionary_order: bool,
    ignore_non_printing: bool,
    merge: bool,
    reverse: bool,
    stable: bool,
    unique: bool,
    check: bool,
    check_silent: bool,
    // NOTE(review): presumably the salt for random sorting — confirm against its users.
    salt: Option<[u8; 16]>,
    // The key selectors; each one is applied to every line in `Line::create`.
    selectors: Vec<FieldSelector>,
    // Field separator byte; `None` makes `tokenize` split on whitespace instead.
    separator: Option<u8>,
    threads: String,
    // Byte sequence that terminates a line in the output.
    line_ending: LineEnding,
    // Defaults to `DEFAULT_BUF_SIZE`.
    buffer_size: usize,
    compress_prog: Option<String>,
    merge_batch_size: usize,
    precomputed: Precomputed,
}
290
/// Data needed for sorting. Should be computed once before starting to sort
/// by calling `GlobalSettings::init_precomputed`.
#[derive(Clone, Debug, Default)]
struct Precomputed {
    /// Whether at least one selector has `needs_tokens` set.
    needs_tokens: bool,
    /// Number of selectors whose mode is `Numeric` or `HumanNumeric`.
    num_infos_per_line: usize,
    /// Number of selectors whose mode is `GeneralNumeric`.
    floats_per_line: usize,
    /// Number of selectors that have `needs_selection` set.
    selections_per_line: usize,
}
300
301impl GlobalSettings {
302    /// Parse a SIZE string into a number of bytes.
303    /// A size string comprises an integer and an optional unit.
304    /// The unit may be k, K, m, M, g, G, t, T, P, E, Z, Y (powers of 1024), or b which is 1.
305    /// Default is K.
306    fn parse_byte_count(input: &str) -> Result<usize, ParseSizeError> {
307        // GNU sort (8.32)   valid: 1b,        k, K, m, M, g, G, t, T, P, E, Z, Y
308        // GNU sort (8.32) invalid:  b, B, 1B,                         p, e, z, y
309        let size = Parser::default()
310            .with_allow_list(&[
311                "b", "k", "K", "m", "M", "g", "G", "t", "T", "P", "E", "Z", "Y", "R", "Q", "%",
312            ])
313            .with_default_unit("K")
314            .with_b_byte_count(true)
315            .parse(input.trim())?;
316
317        usize::try_from(size).map_err(|_| {
318            ParseSizeError::SizeTooBig(translate!("sort-error-buffer-size-too-big", "size" => size))
319        })
320    }
321
322    /// Precompute some data needed for sorting.
323    /// This function **must** be called before starting to sort, and `GlobalSettings` may not be altered
324    /// afterwards.
325    fn init_precomputed(&mut self) {
326        self.precomputed.needs_tokens = self.selectors.iter().any(|s| s.needs_tokens);
327        self.precomputed.selections_per_line =
328            self.selectors.iter().filter(|s| s.needs_selection).count();
329        self.precomputed.num_infos_per_line = self
330            .selectors
331            .iter()
332            .filter(|s| matches!(s.settings.mode, SortMode::Numeric | SortMode::HumanNumeric))
333            .count();
334        self.precomputed.floats_per_line = self
335            .selectors
336            .iter()
337            .filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric))
338            .count();
339    }
340}
341
342impl Default for GlobalSettings {
343    fn default() -> Self {
344        Self {
345            mode: SortMode::Default,
346            debug: false,
347            ignore_leading_blanks: false,
348            ignore_case: false,
349            dictionary_order: false,
350            ignore_non_printing: false,
351            merge: false,
352            reverse: false,
353            stable: false,
354            unique: false,
355            check: false,
356            check_silent: false,
357            salt: None,
358            selectors: vec![],
359            separator: None,
360            threads: String::new(),
361            line_ending: LineEnding::Newline,
362            buffer_size: DEFAULT_BUF_SIZE,
363            compress_prog: None,
364            merge_batch_size: 32,
365            precomputed: Precomputed::default(),
366        }
367    }
368}
369
/// Comparison settings of a single key.
///
/// Either copied from the `GlobalSettings` (see the `From<&GlobalSettings>` impl) or built
/// from the option letters attached to a key specification.
#[derive(Clone, PartialEq, Debug)]
struct KeySettings {
    mode: SortMode,
    ignore_blanks: bool,
    ignore_case: bool,
    dictionary_order: bool,
    ignore_non_printing: bool,
    reverse: bool,
}
379
380impl KeySettings {
381    /// Checks if the supplied combination of `mode`, `ignore_non_printing` and `dictionary_order` is allowed.
382    fn check_compatibility(
383        mode: SortMode,
384        ignore_non_printing: bool,
385        dictionary_order: bool,
386    ) -> Result<(), String> {
387        if matches!(
388            mode,
389            SortMode::Numeric | SortMode::HumanNumeric | SortMode::GeneralNumeric | SortMode::Month
390        ) {
391            if dictionary_order {
392                return Err(
393                    translate!("sort-options-incompatible", "opt1" => "d", "opt2" => mode.get_short_name().unwrap()),
394                );
395            } else if ignore_non_printing {
396                return Err(
397                    translate!("sort-options-incompatible", "opt1" => "i", "opt2" => mode.get_short_name().unwrap()),
398                );
399            }
400        }
401        Ok(())
402    }
403
404    fn set_sort_mode(&mut self, mode: SortMode) -> Result<(), String> {
405        if self.mode != SortMode::Default && self.mode != mode {
406            return Err(
407                translate!("sort-options-incompatible", "opt1" => self.mode.get_short_name().unwrap(), "opt2" => mode.get_short_name().unwrap()),
408            );
409        }
410        Self::check_compatibility(mode, self.ignore_non_printing, self.dictionary_order)?;
411        self.mode = mode;
412        Ok(())
413    }
414
415    fn set_dictionary_order(&mut self) -> Result<(), String> {
416        Self::check_compatibility(self.mode, self.ignore_non_printing, true)?;
417        self.dictionary_order = true;
418        Ok(())
419    }
420
421    fn set_ignore_non_printing(&mut self) -> Result<(), String> {
422        Self::check_compatibility(self.mode, true, self.dictionary_order)?;
423        self.ignore_non_printing = true;
424        Ok(())
425    }
426}
427
428impl From<&GlobalSettings> for KeySettings {
429    fn from(settings: &GlobalSettings) -> Self {
430        Self {
431            mode: settings.mode,
432            ignore_blanks: settings.ignore_leading_blanks,
433            ignore_case: settings.ignore_case,
434            ignore_non_printing: settings.ignore_non_printing,
435            reverse: settings.reverse,
436            dictionary_order: settings.dictionary_order,
437        }
438    }
439}
440
441impl Default for KeySettings {
442    fn default() -> Self {
443        Self::from(&GlobalSettings::default())
444    }
445}
/// What a selector extracted from a line. `Line::create` stores the payload in the
/// matching buffer of `LineData`.
enum Selection<'a> {
    /// An already parsed value; stored in `parsed_floats`.
    AsBigDecimal(GeneralBigDecimalParseResult),
    /// A byte slice together with its `NumInfo`; both are stored.
    WithNumInfo(&'a [u8], NumInfo),
    /// A plain byte slice; only stored if the selector has `needs_selection` set.
    Str(&'a [u8]),
}
451
/// A field of a line, expressed as a range of byte indices into that line.
type Field = Range<usize>;
453
/// A single input line, borrowed from the buffer it was read into.
#[derive(Clone, Debug)]
pub struct Line<'a> {
    /// The bytes of the line; `Line::print` appends the line ending itself.
    line: &'a [u8],
    // NOTE(review): presumably the position of this line within its chunk — confirm with callers.
    index: usize,
}
459
460impl<'a> Line<'a> {
461    /// Creates a new `Line`.
462    ///
463    /// If additional data is needed for sorting it is added to `line_data`.
464    /// `token_buffer` allows to reuse the allocation for tokens.
465    fn create(
466        line: &'a [u8],
467        index: usize,
468        line_data: &mut LineData<'a>,
469        token_buffer: &mut Vec<Field>,
470        settings: &GlobalSettings,
471    ) -> Self {
472        token_buffer.clear();
473        if settings.precomputed.needs_tokens {
474            tokenize(line, settings.separator, token_buffer);
475        }
476        if settings.mode == SortMode::Numeric {
477            // exclude inf, nan, scientific notation
478            let line_num_float = (!line.iter().any(u8::is_ascii_alphabetic))
479                .then(|| std::str::from_utf8(line).ok())
480                .flatten()
481                .and_then(|s| s.parse::<f64>().ok());
482            line_data.line_num_floats.push(line_num_float);
483        }
484        for (selector, selection) in settings
485            .selectors
486            .iter()
487            .map(|selector| (selector, selector.get_selection(line, token_buffer)))
488        {
489            match selection {
490                Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float),
491                Selection::WithNumInfo(str, num_info) => {
492                    line_data.num_infos.push(num_info);
493                    line_data.selections.push(str);
494                }
495                Selection::Str(str) => {
496                    if selector.needs_selection {
497                        line_data.selections.push(str);
498                    }
499                }
500            }
501        }
502        Self { line, index }
503    }
504
505    fn print(&self, writer: &mut impl Write, settings: &GlobalSettings) -> std::io::Result<()> {
506        if settings.debug {
507            self.print_debug(settings, writer)?;
508        } else {
509            writer.write_all(self.line)?;
510            writer.write_all(&[settings.line_ending.into()])?;
511        }
512        Ok(())
513    }
514
515    /// Writes indicators for the selections this line matched. The original line content is NOT expected
516    /// to be already printed.
517    fn print_debug(
518        &self,
519        settings: &GlobalSettings,
520        writer: &mut impl Write,
521    ) -> std::io::Result<()> {
522        // We do not consider this function performance critical, as debug output is only useful for small files,
523        // which are not a performance problem in any case. Therefore there aren't any special performance
524        // optimizations here.
525
526        let line = self
527            .line
528            .iter()
529            .copied()
530            .map(|c| if c == b'\t' { b'>' } else { c })
531            .collect::<Vec<_>>();
532
533        writer.write_all(&line)?;
534        writeln!(writer)?;
535
536        let mut fields = vec![];
537        tokenize(self.line, settings.separator, &mut fields);
538        for selector in &settings.selectors {
539            let mut selection = selector.get_range(self.line, Some(&fields));
540            match selector.settings.mode {
541                SortMode::Numeric | SortMode::HumanNumeric => {
542                    // find out which range is used for numeric comparisons
543                    let (_, num_range) = NumInfo::parse(
544                        &self.line[selection.clone()],
545                        &NumInfoParseSettings {
546                            accept_si_units: selector.settings.mode == SortMode::HumanNumeric,
547                            ..Default::default()
548                        },
549                    );
550                    let initial_selection = selection.clone();
551
552                    // Shorten selection to num_range.
553                    selection.start += num_range.start;
554                    selection.end = selection.start + num_range.len();
555
556                    if num_range == (0..0) {
557                        // This was not a valid number.
558                        // Report no match at the first non-whitespace character.
559                        let leading_whitespace = self.line[selection.clone()]
560                            .iter()
561                            .position(|c| !c.is_ascii_whitespace())
562                            .unwrap_or(0);
563                        selection.start += leading_whitespace;
564                        selection.end += leading_whitespace;
565                    } else {
566                        // include a trailing si unit
567                        if selector.settings.mode == SortMode::HumanNumeric {
568                            if let Some(
569                                b'k' | b'K' | b'M' | b'G' | b'T' | b'P' | b'E' | b'Z' | b'Y' | b'R'
570                                | b'Q',
571                            ) = self.line[selection.end..initial_selection.end].first()
572                            {
573                                selection.end += 1;
574                            }
575                        }
576
577                        // include leading zeroes, a leading minus or a leading decimal point
578                        while let Some(b'-' | b'0' | b'.') =
579                            self.line[initial_selection.start..selection.start].last()
580                        {
581                            selection.start -= 1;
582                        }
583                    }
584                }
585                SortMode::GeneralNumeric => {
586                    let initial_selection = &self.line[selection.clone()];
587
588                    let leading = get_leading_gen(initial_selection);
589
590                    // Shorten selection to leading.
591                    selection.start += leading.start;
592                    selection.end = selection.start + leading.len();
593                }
594                SortMode::Month => {
595                    let initial_selection = &self.line[selection.clone()];
596
597                    let mut month_chars = initial_selection
598                        .iter()
599                        .enumerate()
600                        .skip_while(|(_, c)| c.is_ascii_whitespace());
601
602                    let month = if month_parse(initial_selection) == Month::Unknown {
603                        // We failed to parse a month, which is equivalent to matching nothing.
604                        // Add the "no match for key" marker to the first non-whitespace character.
605                        let first_non_whitespace = month_chars.next();
606                        first_non_whitespace.map_or(
607                            initial_selection.len()..initial_selection.len(),
608                            |(idx, _)| idx..idx,
609                        )
610                    } else {
611                        // We parsed a month. Match the first three non-whitespace characters, which must be the month we parsed.
612                        month_chars.next().unwrap().0
613                            ..month_chars
614                                .nth(2)
615                                .map_or(initial_selection.len(), |(idx, _)| idx)
616                    };
617
618                    // Shorten selection to month.
619                    selection.start += month.start;
620                    selection.end = selection.start + month.len();
621                }
622                _ => {}
623            }
624
625            let select = &line[..selection.start];
626            write!(writer, "{}", " ".repeat(select.len()))?;
627
628            if selection.is_empty() {
629                writeln!(writer, "{}", translate!("sort-error-no-match-for-key"))?;
630            } else {
631                let select = &line[selection];
632                writeln!(writer, "{}", "_".repeat(select.len()))?;
633            }
634        }
635
636        if settings.mode != SortMode::Random
637            && !settings.stable
638            && !settings.unique
639            && (settings.dictionary_order
640                || settings.ignore_leading_blanks
641                || settings.ignore_case
642                || settings.ignore_non_printing
643                || settings.mode != SortMode::Default
644                || settings
645                    .selectors
646                    .last()
647                    .is_none_or(|selector| selector != &FieldSelector::default()))
648        {
649            // A last resort comparator is in use, underline the whole line.
650            if self.line.is_empty() {
651                writeln!(writer, "{}", translate!("sort-error-no-match-for-key"))?;
652            } else {
653                writeln!(writer, "{}", "_".repeat(self.line.len()))?;
654            }
655        }
656        Ok(())
657    }
658}
659
660/// Tokenize a line into fields. The result is stored into `token_buffer`.
661fn tokenize(line: &[u8], separator: Option<u8>, token_buffer: &mut Vec<Field>) {
662    assert!(token_buffer.is_empty());
663    if let Some(separator) = separator {
664        tokenize_with_separator(line, separator, token_buffer);
665    } else {
666        tokenize_default(line, token_buffer);
667    }
668}
669
670/// By default fields are separated by the first whitespace after non-whitespace.
671/// Whitespace is included in fields at the start.
672/// The result is stored into `token_buffer`.
673fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
674    token_buffer.push(0..0);
675    // pretend that there was whitespace in front of the line
676    let mut previous_was_whitespace = true;
677    for (idx, char) in line.iter().enumerate() {
678        if char.is_ascii_whitespace() {
679            if !previous_was_whitespace {
680                token_buffer.last_mut().unwrap().end = idx;
681                token_buffer.push(idx..0);
682            }
683            previous_was_whitespace = true;
684        } else {
685            previous_was_whitespace = false;
686        }
687    }
688    token_buffer.last_mut().unwrap().end = line.len();
689}
690
691/// Split between separators. These separators are not included in fields.
692/// The result is stored into `token_buffer`.
693fn tokenize_with_separator(line: &[u8], separator: u8, token_buffer: &mut Vec<Field>) {
694    let separator_indices = line
695        .iter()
696        .enumerate()
697        .filter_map(|(i, &c)| if c == separator { Some(i) } else { None });
698    let mut start = 0;
699    for sep_idx in separator_indices {
700        token_buffer.push(start..sep_idx);
701        start = sep_idx + 1;
702    }
703    if start < line.len() {
704        token_buffer.push(start..line.len());
705    }
706}
707
/// One end of a key: a field number and a character offset within that field, as parsed
/// from `FIELD[.CHAR]` by `KeyPosition::new`.
#[derive(Clone, PartialEq, Debug)]
struct KeyPosition {
    /// 1-indexed, 0 is invalid.
    field: usize,
    /// 1-indexed, 0 is end of field.
    char: usize,
    /// Stored as passed to `KeyPosition::new`.
    ignore_blanks: bool,
}
716
717impl KeyPosition {
718    fn new(key: &str, default_char_index: usize, ignore_blanks: bool) -> Result<Self, String> {
719        let mut field_and_char = key.split('.');
720
721        let field = field_and_char
722            .next()
723            .ok_or_else(|| translate!("sort-invalid-key", "key" => key.quote()))?;
724        let char = field_and_char.next();
725
726        let field = match field.parse::<usize>() {
727            Ok(f) => f,
728            Err(e) if *e.kind() == IntErrorKind::PosOverflow => usize::MAX,
729            Err(e) => {
730                return Err(
731                    translate!("sort-failed-parse-field-index", "field" => field.quote(), "error" => e),
732                );
733            }
734        };
735        if field == 0 {
736            return Err(translate!("sort-field-index-cannot-be-zero"));
737        }
738
739        let char = char.map_or(Ok(default_char_index), |char| {
740            char.parse().map_err(|e: std::num::ParseIntError| {
741                translate!("sort-failed-parse-char-index", "char" => char.quote(), "error" => e)
742            })
743        })?;
744
745        Ok(Self {
746            field,
747            char,
748            ignore_blanks,
749        })
750    }
751}
752
impl Default for KeyPosition {
    /// The first character of the first field, without ignoring blanks.
    fn default() -> Self {
        Self {
            field: 1,
            char: 1,
            ignore_blanks: false,
        }
    }
}
762
/// A sort key: the part of a line to compare and the settings to compare it with.
#[derive(Clone, PartialEq, Debug, Default)]
struct FieldSelector {
    /// Where the key starts.
    from: KeyPosition,
    /// Where the key ends; `None` if the key specification had no end position.
    to: Option<KeyPosition>,
    /// How the selected part is compared.
    settings: KeySettings,
    // Whether the line has to be split into fields before this selector can be applied.
    needs_tokens: bool,
    // Whether this selector operates on a sub-slice of a line.
    // Selections are therefore not needed when this selector matches the whole line
    // or the sort mode is general-numeric.
    needs_selection: bool,
}
774
775impl FieldSelector {
776    /// Splits this position into the actual position and the attached options.
777    fn split_key_options(position: &str) -> (&str, &str) {
778        if let Some((options_start, _)) = position.char_indices().find(|(_, c)| c.is_alphabetic()) {
779            position.split_at(options_start)
780        } else {
781            (position, "")
782        }
783    }
784
785    fn parse(key: &str, global_settings: &GlobalSettings) -> UResult<Self> {
786        let mut from_to = key.split(',');
787        let (from, from_options) = Self::split_key_options(from_to.next().unwrap());
788        let to = from_to.next().map(Self::split_key_options);
789        let options_are_empty = from_options.is_empty() && matches!(to, None | Some((_, "")));
790
791        if options_are_empty {
792            // Inherit the global settings if there are no options attached to this key.
793            (|| {
794                // This would be ideal for a try block, I think. In the meantime this closure allows
795                // to use the `?` operator here.
796                Self::new(
797                    KeyPosition::new(from, 1, global_settings.ignore_leading_blanks)?,
798                    to.map(|(to, _)| {
799                        KeyPosition::new(to, 0, global_settings.ignore_leading_blanks)
800                    })
801                    .transpose()?,
802                    KeySettings::from(global_settings),
803                )
804            })()
805        } else {
806            // Do not inherit from `global_settings`, as there are options attached to this key.
807            Self::parse_with_options((from, from_options), to)
808        }
809        .map_err(|msg| {
810            SortError::ParseKeyError {
811                key: key.to_owned(),
812                msg,
813            }
814            .into()
815        })
816    }
817
818    fn parse_with_options(
819        (from, from_options): (&str, &str),
820        to: Option<(&str, &str)>,
821    ) -> Result<Self, String> {
822        /// Applies `options` to `key_settings`, returning if the 'b'-flag (ignore blanks) was present.
823        fn parse_key_settings(
824            options: &str,
825            key_settings: &mut KeySettings,
826        ) -> Result<bool, String> {
827            let mut ignore_blanks = false;
828            for option in options.chars() {
829                match option {
830                    'M' => key_settings.set_sort_mode(SortMode::Month)?,
831                    'b' => ignore_blanks = true,
832                    'd' => key_settings.set_dictionary_order()?,
833                    'f' => key_settings.ignore_case = true,
834                    'g' => key_settings.set_sort_mode(SortMode::GeneralNumeric)?,
835                    'h' => key_settings.set_sort_mode(SortMode::HumanNumeric)?,
836                    'i' => key_settings.set_ignore_non_printing()?,
837                    'n' => key_settings.set_sort_mode(SortMode::Numeric)?,
838                    'R' => key_settings.set_sort_mode(SortMode::Random)?,
839                    'r' => key_settings.reverse = true,
840                    'V' => key_settings.set_sort_mode(SortMode::Version)?,
841                    c => {
842                        return Err(translate!("sort-invalid-option", "option" => c));
843                    }
844                }
845            }
846            Ok(ignore_blanks)
847        }
848
849        let mut key_settings = KeySettings::default();
850        let from = parse_key_settings(from_options, &mut key_settings)
851            .map(|ignore_blanks| KeyPosition::new(from, 1, ignore_blanks))??;
852        let to = if let Some((to, to_options)) = to {
853            Some(
854                parse_key_settings(to_options, &mut key_settings)
855                    .map(|ignore_blanks| KeyPosition::new(to, 0, ignore_blanks))??,
856            )
857        } else {
858            None
859        };
860        Self::new(from, to, key_settings)
861    }
862
863    fn new(
864        from: KeyPosition,
865        to: Option<KeyPosition>,
866        settings: KeySettings,
867    ) -> Result<Self, String> {
868        if from.char == 0 {
869            Err(translate!("sort-invalid-char-index-zero-start"))
870        } else {
871            Ok(Self {
872                needs_selection: (from.field != 1
873                    || from.char != 1
874                    || to.is_some()
875                    || matches!(settings.mode, SortMode::Numeric | SortMode::HumanNumeric)
876                    || from.ignore_blanks)
877                    && !matches!(settings.mode, SortMode::GeneralNumeric),
878                needs_tokens: from.field != 1 || from.char == 0 || to.is_some(),
879                from,
880                to,
881                settings,
882            })
883        }
884    }
885
886    /// Get the selection that corresponds to this selector for the line.
887    /// If `needs_fields` returned false, tokens may be empty.
888    fn get_selection<'a>(&self, line: &'a [u8], tokens: &[Field]) -> Selection<'a> {
889        // `get_range` expects `None` when we don't need tokens and would get confused by an empty vector.
890        let tokens = if self.needs_tokens {
891            Some(tokens)
892        } else {
893            None
894        };
895        let mut range_str = &line[self.get_range(line, tokens)];
896        if self.settings.mode == SortMode::Numeric || self.settings.mode == SortMode::HumanNumeric {
897            // Parse NumInfo for this number.
898            let (info, num_range) = NumInfo::parse(
899                range_str,
900                &NumInfoParseSettings {
901                    accept_si_units: self.settings.mode == SortMode::HumanNumeric,
902                    ..Default::default()
903                },
904            );
905            // Shorten the range to what we need to pass to numeric_str_cmp later.
906            range_str = &range_str[num_range];
907            Selection::WithNumInfo(range_str, info)
908        } else if self.settings.mode == SortMode::GeneralNumeric {
909            // Parse this number as BigDecimal, as this is the requirement for general numeric sorting.
910            Selection::AsBigDecimal(general_bd_parse(&range_str[get_leading_gen(range_str)]))
911        } else {
912            // This is not a numeric sort, so we don't need a NumCache.
913            Selection::Str(range_str)
914        }
915    }
916
917    /// Look up the range in the line that corresponds to this selector.
918    /// If `needs_fields` returned false, tokens must be None.
919    fn get_range(&self, line: &[u8], tokens: Option<&[Field]>) -> Range<usize> {
920        enum Resolution {
921            // The start index of the resolved character, inclusive
922            StartOfChar(usize),
923            // The end index of the resolved character, exclusive.
924            // This is only returned if the character index is 0.
925            EndOfChar(usize),
926            // The resolved character would be in front of the first character
927            TooLow,
928            // The resolved character would be after the last character
929            TooHigh,
930        }
931
932        /// Get the index for this line given the [`KeyPosition`]
933        fn resolve_index(
934            line: &[u8],
935            tokens: Option<&[Field]>,
936            position: &KeyPosition,
937        ) -> Resolution {
938            if matches!(tokens, Some(tokens) if tokens.len() < position.field) {
939                Resolution::TooHigh
940            } else if position.char == 0 {
941                let end = tokens.unwrap()[position.field - 1].end;
942                if end == 0 {
943                    Resolution::TooLow
944                } else {
945                    Resolution::EndOfChar(end)
946                }
947            } else {
948                let mut idx = if position.field == 1 {
949                    // The first field always starts at 0.
950                    // We don't need tokens for this case.
951                    0
952                } else {
953                    tokens.unwrap()[position.field - 1].start
954                };
955                // strip blanks if needed
956                if position.ignore_blanks {
957                    idx += line[idx..]
958                        .iter()
959                        .enumerate()
960                        .find(|(_, c)| !c.is_ascii_whitespace())
961                        .map_or(line[idx..].len(), |(idx, _)| idx);
962                }
963                // apply the character index
964                idx += line[idx..]
965                    .iter()
966                    .enumerate()
967                    .nth(position.char - 1)
968                    .map_or(line[idx..].len(), |(idx, _)| idx);
969                if idx >= line.len() {
970                    Resolution::TooHigh
971                } else {
972                    Resolution::StartOfChar(idx)
973                }
974            }
975        }
976
977        match resolve_index(line, tokens, &self.from) {
978            Resolution::StartOfChar(from) => {
979                let to = self.to.as_ref().map(|to| resolve_index(line, tokens, to));
980
981                let mut range = match to {
982                    Some(Resolution::StartOfChar(mut to)) => {
983                        // We need to include the character at `to`.
984                        to += 1;
985                        from..to
986                    }
987                    Some(Resolution::EndOfChar(to)) => from..to,
988                    // If `to` was not given or the match would be after the end of the line,
989                    // match everything until the end of the line.
990                    None | Some(Resolution::TooHigh) => from..line.len(),
991                    // If `to` is before the start of the line, report no match.
992                    // This can happen if the line starts with a separator.
993                    Some(Resolution::TooLow) => 0..0,
994                };
995                if range.start > range.end {
996                    range.end = range.start;
997                }
998                range
999            }
1000            Resolution::TooLow | Resolution::EndOfChar(_) => {
1001                unreachable!(
1002                    "This should only happen if the field start index is 0, but that should already have caused an error."
1003                )
1004            }
1005            // While for comparisons it's only important that this is an empty slice,
1006            // to produce accurate debug output we need to match an empty slice at the end of the line.
1007            Resolution::TooHigh => line.len()..line.len(),
1008        }
1009    }
1010}
1011
1012/// Creates an `Arg` that conflicts with all other sort modes.
1013fn make_sort_mode_arg(mode: &'static str, short: char, help: String) -> Arg {
1014    Arg::new(mode)
1015        .short(short)
1016        .long(mode)
1017        .help(help)
1018        .action(ArgAction::SetTrue)
1019        .conflicts_with_all(
1020            options::modes::ALL_SORT_MODES
1021                .iter()
1022                .filter(|&&m| m != mode),
1023        )
1024}
1025
1026#[cfg(target_os = "linux")]
1027fn get_rlimit() -> UResult<usize> {
1028    let mut limit = rlimit {
1029        rlim_cur: 0,
1030        rlim_max: 0,
1031    };
1032    match unsafe { getrlimit(RLIMIT_NOFILE, &raw mut limit) } {
1033        0 => Ok(limit.rlim_cur as usize),
1034        _ => Err(UUsageError::new(2, translate!("sort-failed-fetch-rlimit"))),
1035    }
1036}
1037
/// File operand that stands for standard input; used as the input when no files are given.
const STDIN_FILE: &str = "-";
1039
1040#[uucore::main]
1041#[allow(clippy::cognitive_complexity)]
1042pub fn uumain(args: impl uucore::Args) -> UResult<()> {
1043    let mut settings = GlobalSettings::default();
1044
1045    let matches = uucore::clap_localization::handle_clap_result_with_exit_code(uu_app(), args, 2)?;
1046
1047    // Prevent -o/--output to be specified multiple times
1048    if matches
1049        .get_occurrences::<OsString>(options::OUTPUT)
1050        .is_some_and(|out| out.len() > 1)
1051    {
1052        return Err(SortError::MultipleOutputFiles.into());
1053    }
1054
1055    settings.debug = matches.get_flag(options::DEBUG);
1056
1057    // check whether user specified a zero terminated list of files for input, otherwise read files from args
1058    let mut files: Vec<OsString> = if matches.contains_id(options::FILES0_FROM) {
1059        let files0_from: PathBuf = matches
1060            .get_one::<OsString>(options::FILES0_FROM)
1061            .map(|v| v.into())
1062            .unwrap_or_default();
1063
1064        // Cannot combine FILES with FILES0_FROM
1065        if let Some(s) = matches.get_one::<OsString>(options::FILES) {
1066            return Err(SortError::FileOperandsCombined { file: s.into() }.into());
1067        }
1068
1069        let mut files = Vec::new();
1070
1071        // sort errors with "cannot open: [...]" instead of "cannot read: [...]" here
1072        let reader = open_with_open_failed_error(&files0_from)?;
1073        let buf_reader = BufReader::new(reader);
1074        for (line_num, line) in buf_reader.split(b'\0').flatten().enumerate() {
1075            let f = std::str::from_utf8(&line)
1076                .expect("Could not parse string from zero terminated input.");
1077            match f {
1078                STDIN_FILE => {
1079                    return Err(SortError::MinusInStdIn.into());
1080                }
1081                "" => {
1082                    return Err(SortError::ZeroLengthFileName {
1083                        file: files0_from,
1084                        line_num: line_num + 1,
1085                    }
1086                    .into());
1087                }
1088                _ => {}
1089            }
1090
1091            files.push(OsString::from(
1092                std::str::from_utf8(&line)
1093                    .expect("Could not parse string from zero terminated input."),
1094            ));
1095        }
1096        if files.is_empty() {
1097            return Err(SortError::EmptyInputFile { file: files0_from }.into());
1098        }
1099        files
1100    } else {
1101        matches
1102            .get_many::<OsString>(options::FILES)
1103            .map(|v| v.map(ToOwned::to_owned).collect())
1104            .unwrap_or_default()
1105    };
1106
1107    settings.mode = if matches.get_flag(options::modes::HUMAN_NUMERIC)
1108        || matches
1109            .get_one::<String>(options::modes::SORT)
1110            .is_some_and(|s| s == "human-numeric")
1111    {
1112        SortMode::HumanNumeric
1113    } else if matches.get_flag(options::modes::MONTH)
1114        || matches
1115            .get_one::<String>(options::modes::SORT)
1116            .is_some_and(|s| s == "month")
1117    {
1118        SortMode::Month
1119    } else if matches.get_flag(options::modes::GENERAL_NUMERIC)
1120        || matches
1121            .get_one::<String>(options::modes::SORT)
1122            .is_some_and(|s| s == "general-numeric")
1123    {
1124        SortMode::GeneralNumeric
1125    } else if matches.get_flag(options::modes::NUMERIC)
1126        || matches
1127            .get_one::<String>(options::modes::SORT)
1128            .is_some_and(|s| s == "numeric")
1129    {
1130        SortMode::Numeric
1131    } else if matches.get_flag(options::modes::VERSION)
1132        || matches
1133            .get_one::<String>(options::modes::SORT)
1134            .is_some_and(|s| s == "version")
1135    {
1136        SortMode::Version
1137    } else if matches.get_flag(options::modes::RANDOM)
1138        || matches
1139            .get_one::<String>(options::modes::SORT)
1140            .is_some_and(|s| s == "random")
1141    {
1142        settings.salt = Some(get_rand_string());
1143        SortMode::Random
1144    } else {
1145        SortMode::Default
1146    };
1147
1148    settings.dictionary_order = matches.get_flag(options::DICTIONARY_ORDER);
1149    settings.ignore_non_printing = matches.get_flag(options::IGNORE_NONPRINTING);
1150    if matches.contains_id(options::PARALLEL) {
1151        // "0" is default - threads = num of cores
1152        settings.threads = matches
1153            .get_one::<String>(options::PARALLEL)
1154            .map_or_else(|| "0".to_string(), String::from);
1155        unsafe {
1156            env::set_var("RAYON_NUM_THREADS", &settings.threads);
1157        }
1158    }
1159
1160    settings.buffer_size =
1161        matches
1162            .get_one::<String>(options::BUF_SIZE)
1163            .map_or(Ok(DEFAULT_BUF_SIZE), |s| {
1164                GlobalSettings::parse_byte_count(s).map_err(|e| {
1165                    USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE))
1166                })
1167            })?;
1168
1169    let mut tmp_dir = TmpDirWrapper::new(
1170        matches
1171            .get_one::<String>(options::TMP_DIR)
1172            .map_or_else(env::temp_dir, PathBuf::from),
1173    );
1174
1175    settings.compress_prog = matches
1176        .get_one::<String>(options::COMPRESS_PROG)
1177        .map(String::from);
1178
1179    if let Some(n_merge) = matches.get_one::<String>(options::BATCH_SIZE) {
1180        match n_merge.parse::<usize>() {
1181            Ok(parsed_value) => {
1182                if parsed_value < 2 {
1183                    show_error!(
1184                        "{}",
1185                        translate!("sort-invalid-batch-size-arg", "arg" => n_merge)
1186                    );
1187                    return Err(UUsageError::new(
1188                        2,
1189                        translate!("sort-minimum-batch-size-two"),
1190                    ));
1191                }
1192                settings.merge_batch_size = parsed_value;
1193            }
1194            Err(e) => {
1195                let error_message = if *e.kind() == IntErrorKind::PosOverflow {
1196                    let batch_too_large = translate!(
1197                        "sort-batch-size-too-large",
1198                        "arg" => n_merge.quote()
1199                    );
1200
1201                    #[cfg(target_os = "linux")]
1202                    {
1203                        show_error!("{}", batch_too_large);
1204
1205                        translate!(
1206                            "sort-maximum-batch-size-rlimit",
1207                            "rlimit" =>  get_rlimit()?
1208                        )
1209                    }
1210                    #[cfg(not(target_os = "linux"))]
1211                    {
1212                        batch_too_large
1213                    }
1214                } else {
1215                    translate!(
1216                        "sort-invalid-batch-size-arg",
1217                        "arg" =>  n_merge,
1218                    )
1219                };
1220
1221                return Err(UUsageError::new(2, error_message));
1222            }
1223        }
1224    }
1225
1226    settings.line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
1227    settings.merge = matches.get_flag(options::MERGE);
1228
1229    settings.check = matches.contains_id(options::check::CHECK);
1230    if matches.get_flag(options::check::CHECK_SILENT)
1231        || matches!(
1232            matches
1233                .get_one::<String>(options::check::CHECK)
1234                .map(|s| s.as_str()),
1235            Some(options::check::SILENT | options::check::QUIET)
1236        )
1237    {
1238        settings.check_silent = true;
1239        settings.check = true;
1240    }
1241
1242    settings.ignore_case = matches.get_flag(options::IGNORE_CASE);
1243
1244    settings.ignore_leading_blanks = matches.get_flag(options::IGNORE_LEADING_BLANKS);
1245
1246    settings.reverse = matches.get_flag(options::REVERSE);
1247    settings.stable = matches.get_flag(options::STABLE);
1248    settings.unique = matches.get_flag(options::UNIQUE);
1249
1250    if files.is_empty() {
1251        /* if no file, default to stdin */
1252        files.push(OsString::from(STDIN_FILE));
1253    } else if settings.check && files.len() != 1 {
1254        return Err(UUsageError::new(
1255            2,
1256            translate!("sort-extra-operand-not-allowed-with-c", "operand" => files[1].quote()),
1257        ));
1258    }
1259
1260    if let Some(arg) = matches.get_one::<OsString>(options::SEPARATOR) {
1261        let mut separator = arg.to_str().ok_or_else(|| {
1262            UUsageError::new(
1263                2,
1264                translate!("sort-separator-not-valid-unicode", "arg" => arg.quote()),
1265            )
1266        })?;
1267        if separator == "\\0" {
1268            separator = "\0";
1269        }
1270        // This rejects non-ASCII codepoints, but perhaps we don't have to.
1271        // On the other hand GNU accepts any single byte, valid unicode or not.
1272        // (Supporting multi-byte chars would require changes in tokenize_with_separator().)
1273        let &[sep_char] = separator.as_bytes() else {
1274            return Err(UUsageError::new(
1275                2,
1276                translate!("sort-separator-must-be-one-char", "separator" => separator.quote()),
1277            ));
1278        };
1279        settings.separator = Some(sep_char);
1280    }
1281
1282    if let Some(values) = matches.get_many::<String>(options::KEY) {
1283        for value in values {
1284            let selector = FieldSelector::parse(value, &settings)?;
1285            if selector.settings.mode == SortMode::Random && settings.salt.is_none() {
1286                settings.salt = Some(get_rand_string());
1287            }
1288            settings.selectors.push(selector);
1289        }
1290    }
1291
1292    if !matches.contains_id(options::KEY) {
1293        // add a default selector matching the whole line
1294        let key_settings = KeySettings::from(&settings);
1295        settings.selectors.push(
1296            FieldSelector::new(
1297                KeyPosition {
1298                    field: 1,
1299                    char: 1,
1300                    ignore_blanks: key_settings.ignore_blanks,
1301                },
1302                None,
1303                key_settings,
1304            )
1305            .unwrap(),
1306        );
1307    }
1308
1309    // Verify that we can open all input files.
1310    // It is the correct behavior to close all files afterwards,
1311    // and to reopen them at a later point. This is different from how the output file is handled,
1312    // probably to prevent running out of file descriptors.
1313    for file in &files {
1314        open(file)?;
1315    }
1316
1317    let output = Output::new(matches.get_one::<OsString>(options::OUTPUT))?;
1318
1319    settings.init_precomputed();
1320
1321    let result = exec(&mut files, &settings, output, &mut tmp_dir);
1322    // Wait here if `SIGINT` was received,
1323    // for signal handler to do its work and terminate the program.
1324    tmp_dir.wait_if_signal();
1325    result
1326}
1327
1328pub fn uu_app() -> Command {
1329    uucore::clap_localization::configure_localized_command(
1330        Command::new(uucore::util_name())
1331            .version(uucore::crate_version!())
1332            .about(translate!("sort-about"))
1333            .after_help(translate!("sort-after-help"))
1334            .override_usage(format_usage(&translate!("sort-usage"))),
1335    )
1336    .infer_long_args(true)
1337    .disable_help_flag(true)
1338    .disable_version_flag(true)
1339    .args_override_self(true)
1340    .arg(
1341        Arg::new(options::HELP)
1342            .long(options::HELP)
1343            .help(translate!("sort-help-help"))
1344            .action(ArgAction::Help),
1345    )
1346    .arg(
1347        Arg::new(options::VERSION)
1348            .long(options::VERSION)
1349            .help(translate!("sort-help-version"))
1350            .action(ArgAction::Version),
1351    )
1352    .arg(
1353        Arg::new(options::modes::SORT)
1354            .long(options::modes::SORT)
1355            .value_parser(ShortcutValueParser::new([
1356                "general-numeric",
1357                "human-numeric",
1358                "month",
1359                "numeric",
1360                "version",
1361                "random",
1362            ]))
1363            .conflicts_with_all(options::modes::ALL_SORT_MODES),
1364    )
1365    .arg(make_sort_mode_arg(
1366        options::modes::HUMAN_NUMERIC,
1367        'h',
1368        translate!("sort-help-human-numeric"),
1369    ))
1370    .arg(make_sort_mode_arg(
1371        options::modes::MONTH,
1372        'M',
1373        translate!("sort-help-month"),
1374    ))
1375    .arg(make_sort_mode_arg(
1376        options::modes::NUMERIC,
1377        'n',
1378        translate!("sort-help-numeric"),
1379    ))
1380    .arg(make_sort_mode_arg(
1381        options::modes::GENERAL_NUMERIC,
1382        'g',
1383        translate!("sort-help-general-numeric"),
1384    ))
1385    .arg(make_sort_mode_arg(
1386        options::modes::VERSION,
1387        'V',
1388        translate!("sort-help-version-sort"),
1389    ))
1390    .arg(make_sort_mode_arg(
1391        options::modes::RANDOM,
1392        'R',
1393        translate!("sort-help-random"),
1394    ))
1395    .arg(
1396        Arg::new(options::DICTIONARY_ORDER)
1397            .short('d')
1398            .long(options::DICTIONARY_ORDER)
1399            .help(translate!("sort-help-dictionary-order"))
1400            .conflicts_with_all([
1401                options::modes::NUMERIC,
1402                options::modes::GENERAL_NUMERIC,
1403                options::modes::HUMAN_NUMERIC,
1404                options::modes::MONTH,
1405            ])
1406            .action(ArgAction::SetTrue),
1407    )
1408    .arg(
1409        Arg::new(options::MERGE)
1410            .short('m')
1411            .long(options::MERGE)
1412            .help(translate!("sort-help-merge"))
1413            .action(ArgAction::SetTrue),
1414    )
1415    .arg(
1416        Arg::new(options::check::CHECK)
1417            .short('c')
1418            .long(options::check::CHECK)
1419            .require_equals(true)
1420            .num_args(0..)
1421            .value_parser(ShortcutValueParser::new([
1422                options::check::SILENT,
1423                options::check::QUIET,
1424                options::check::DIAGNOSE_FIRST,
1425            ]))
1426            .conflicts_with_all([options::OUTPUT, options::check::CHECK_SILENT])
1427            .help(translate!("sort-help-check")),
1428    )
1429    .arg(
1430        Arg::new(options::check::CHECK_SILENT)
1431            .short('C')
1432            .long(options::check::CHECK_SILENT)
1433            .conflicts_with_all([options::OUTPUT, options::check::CHECK])
1434            .help(translate!("sort-help-check-silent"))
1435            .action(ArgAction::SetTrue),
1436    )
1437    .arg(
1438        Arg::new(options::IGNORE_CASE)
1439            .short('f')
1440            .long(options::IGNORE_CASE)
1441            .help(translate!("sort-help-ignore-case"))
1442            .action(ArgAction::SetTrue),
1443    )
1444    .arg(
1445        Arg::new(options::IGNORE_NONPRINTING)
1446            .short('i')
1447            .long(options::IGNORE_NONPRINTING)
1448            .help(translate!("sort-help-ignore-nonprinting"))
1449            .conflicts_with_all([
1450                options::modes::NUMERIC,
1451                options::modes::GENERAL_NUMERIC,
1452                options::modes::HUMAN_NUMERIC,
1453                options::modes::MONTH,
1454            ])
1455            .action(ArgAction::SetTrue),
1456    )
1457    .arg(
1458        Arg::new(options::IGNORE_LEADING_BLANKS)
1459            .short('b')
1460            .long(options::IGNORE_LEADING_BLANKS)
1461            .help(translate!("sort-help-ignore-leading-blanks"))
1462            .action(ArgAction::SetTrue),
1463    )
1464    .arg(
1465        Arg::new(options::OUTPUT)
1466            .short('o')
1467            .long(options::OUTPUT)
1468            .help(translate!("sort-help-output"))
1469            .value_parser(ValueParser::os_string())
1470            .value_name("FILENAME")
1471            .value_hint(clap::ValueHint::FilePath)
1472            .num_args(1)
1473            .allow_hyphen_values(true)
1474            // To detect multiple occurrences and raise an error
1475            .action(ArgAction::Append),
1476    )
1477    .arg(
1478        Arg::new(options::REVERSE)
1479            .short('r')
1480            .long(options::REVERSE)
1481            .help(translate!("sort-help-reverse"))
1482            .action(ArgAction::SetTrue),
1483    )
1484    .arg(
1485        Arg::new(options::STABLE)
1486            .short('s')
1487            .long(options::STABLE)
1488            .help(translate!("sort-help-stable"))
1489            .action(ArgAction::SetTrue),
1490    )
1491    .arg(
1492        Arg::new(options::UNIQUE)
1493            .short('u')
1494            .long(options::UNIQUE)
1495            .help(translate!("sort-help-unique"))
1496            .action(ArgAction::SetTrue),
1497    )
1498    .arg(
1499        Arg::new(options::KEY)
1500            .short('k')
1501            .long(options::KEY)
1502            .help(translate!("sort-help-key"))
1503            .action(ArgAction::Append)
1504            .num_args(1),
1505    )
1506    .arg(
1507        Arg::new(options::SEPARATOR)
1508            .short('t')
1509            .long(options::SEPARATOR)
1510            .help(translate!("sort-help-separator"))
1511            .value_parser(ValueParser::os_string()),
1512    )
1513    .arg(
1514        Arg::new(options::ZERO_TERMINATED)
1515            .short('z')
1516            .long(options::ZERO_TERMINATED)
1517            .help(translate!("sort-help-zero-terminated"))
1518            .action(ArgAction::SetTrue),
1519    )
1520    .arg(
1521        Arg::new(options::PARALLEL)
1522            .long(options::PARALLEL)
1523            .help(translate!("sort-help-parallel"))
1524            .value_name("NUM_THREADS"),
1525    )
1526    .arg(
1527        Arg::new(options::BUF_SIZE)
1528            .short('S')
1529            .long(options::BUF_SIZE)
1530            .help(translate!("sort-help-buf-size"))
1531            .value_name("SIZE"),
1532    )
1533    .arg(
1534        Arg::new(options::TMP_DIR)
1535            .short('T')
1536            .long(options::TMP_DIR)
1537            .help(translate!("sort-help-tmp-dir"))
1538            .value_name("DIR")
1539            .value_hint(clap::ValueHint::DirPath),
1540    )
1541    .arg(
1542        Arg::new(options::COMPRESS_PROG)
1543            .long(options::COMPRESS_PROG)
1544            .help(translate!("sort-help-compress-prog"))
1545            .value_name("PROG")
1546            .value_hint(clap::ValueHint::CommandName),
1547    )
1548    .arg(
1549        Arg::new(options::BATCH_SIZE)
1550            .long(options::BATCH_SIZE)
1551            .help(translate!("sort-help-batch-size"))
1552            .value_name("N_MERGE"),
1553    )
1554    .arg(
1555        Arg::new(options::FILES0_FROM)
1556            .long(options::FILES0_FROM)
1557            .help(translate!("sort-help-files0-from"))
1558            .value_name("NUL_FILE")
1559            .value_parser(ValueParser::os_string())
1560            .value_hint(clap::ValueHint::FilePath),
1561    )
1562    .arg(
1563        Arg::new(options::DEBUG)
1564            .long(options::DEBUG)
1565            .help(translate!("sort-help-debug"))
1566            .action(ArgAction::SetTrue),
1567    )
1568    .arg(
1569        Arg::new(options::FILES)
1570            .action(ArgAction::Append)
1571            .value_parser(ValueParser::os_string())
1572            .value_hint(clap::ValueHint::FilePath),
1573    )
1574}
1575
1576fn exec(
1577    files: &mut [OsString],
1578    settings: &GlobalSettings,
1579    output: Output,
1580    tmp_dir: &mut TmpDirWrapper,
1581) -> UResult<()> {
1582    if settings.merge {
1583        merge::merge(files, settings, output, tmp_dir)
1584    } else if settings.check {
1585        if files.len() > 1 {
1586            Err(UUsageError::new(
1587                2,
1588                translate!("sort-only-one-file-allowed-with-c"),
1589            ))
1590        } else {
1591            check::check(files.first().unwrap(), settings)
1592        }
1593    } else {
1594        let mut lines = files.iter().map(open);
1595        ext_sort(&mut lines, settings, output, tmp_dir)
1596    }
1597}
1598
1599fn sort_by<'a>(unsorted: &mut Vec<Line<'a>>, settings: &GlobalSettings, line_data: &LineData<'a>) {
1600    if settings.stable || settings.unique {
1601        unsorted.par_sort_by(|a, b| compare_by(a, b, settings, line_data, line_data));
1602    } else {
1603        unsorted.par_sort_unstable_by(|a, b| compare_by(a, b, settings, line_data, line_data));
1604    }
1605}
1606
/// Compares lines `a` and `b` according to the configured sort keys.
///
/// `a_line_data` and `b_line_data` hold the precomputed per-line data for `a` and `b`
/// respectively; each line's `index` is used to locate its entries in them.
///
/// The comparison runs in three stages:
/// 1. A fast path, taken when both lines have a precomputed float in `line_num_floats`.
/// 2. Every selector in `global_settings.selectors`, in order. The first non-equal result
///    is returned, reversed if that selector's own `reverse` setting is enabled.
/// 3. A "last resort" comparison once all selectors report equality; its result is
///    reversed if the global `reverse` setting is enabled.
fn compare_by<'a>(
    a: &Line<'a>,
    b: &Line<'a>,
    global_settings: &GlobalSettings,
    a_line_data: &LineData<'a>,
    b_line_data: &LineData<'a>,
) -> Ordering {
    // Cursors into each line's precomputed entries. Each one advances only when a
    // selector actually consumes an entry of that kind, so the order of the selectors
    // determines which entry belongs to which selector.
    let mut selection_index = 0;
    let mut num_info_index = 0;
    let mut parsed_float_index = 0;

    if let (Some(Some(a_f64)), Some(Some(b_f64))) = (
        a_line_data.line_num_floats.get(a.index),
        b_line_data.line_num_floats.get(b.index),
    ) {
        // we don't use total_cmp() because it always sorts -0 before 0
        if let Some(cmp) = a_f64.partial_cmp(b_f64) {
            // don't trust `Ordering::Equal` if lines are not fully equal
            if cmp != Ordering::Equal || a.line == b.line {
                return if global_settings.reverse {
                    cmp.reverse()
                } else {
                    cmp
                };
            }
        }
    }

    for selector in &global_settings.selectors {
        let (a_str, b_str) = if selector.needs_selection {
            // Each line owns `selections_per_line` consecutive entries.
            let selections = (
                a_line_data.selections
                    [a.index * global_settings.precomputed.selections_per_line + selection_index],
                b_line_data.selections
                    [b.index * global_settings.precomputed.selections_per_line + selection_index],
            );
            selection_index += 1;
            selections
        } else {
            // We can select the whole line.
            (a.line, b.line)
        };

        let settings = &selector.settings;

        let cmp: Ordering = match settings.mode {
            SortMode::Random => {
                // check if the two strings are equal
                if custom_str_cmp(
                    a_str,
                    b_str,
                    settings.ignore_non_printing,
                    settings.dictionary_order,
                    settings.ignore_case,
                ) == Ordering::Equal
                {
                    Ordering::Equal
                } else {
                    // Only if they are not equal compare by the hash
                    // NOTE(review): `salt` is unwrapped here and would panic if unset;
                    // presumably it is always set when a Random selector exists - confirm.
                    random_shuffle(a_str, b_str, &global_settings.salt.unwrap())
                }
            }
            SortMode::Numeric => {
                // Each line owns `num_infos_per_line` consecutive entries.
                let a_num_info = &a_line_data.num_infos
                    [a.index * global_settings.precomputed.num_infos_per_line + num_info_index];
                let b_num_info = &b_line_data.num_infos
                    [b.index * global_settings.precomputed.num_infos_per_line + num_info_index];
                num_info_index += 1;
                numeric_str_cmp((a_str, a_num_info), (b_str, b_num_info))
            }
            SortMode::HumanNumeric => {
                // Shares the `num_infos` storage (and its cursor) with `Numeric`.
                let a_num_info = &a_line_data.num_infos
                    [a.index * global_settings.precomputed.num_infos_per_line + num_info_index];
                let b_num_info = &b_line_data.num_infos
                    [b.index * global_settings.precomputed.num_infos_per_line + num_info_index];
                num_info_index += 1;
                human_numeric_str_cmp((a_str, a_num_info), (b_str, b_num_info))
            }
            SortMode::GeneralNumeric => {
                // Each line owns `floats_per_line` consecutive entries.
                let a_float = &a_line_data.parsed_floats
                    [a.index * global_settings.precomputed.floats_per_line + parsed_float_index];
                let b_float = &b_line_data.parsed_floats
                    [b.index * global_settings.precomputed.floats_per_line + parsed_float_index];
                parsed_float_index += 1;
                general_numeric_compare(a_float, b_float)
            }
            SortMode::Month => month_compare(a_str, b_str),
            SortMode::Version => version_cmp(a_str, b_str),
            SortMode::Default => custom_str_cmp(
                a_str,
                b_str,
                settings.ignore_non_printing,
                settings.dictionary_order,
                settings.ignore_case,
            ),
        };
        if cmp != Ordering::Equal {
            // Note: this is the selector's own `reverse`, not the global one.
            return if settings.reverse { cmp.reverse() } else { cmp };
        }
    }

    // Call "last resort compare" if all selectors returned Equal
    // The whole-line byte comparison is skipped (the lines stay `Equal`) when the global
    // mode is Random or when `stable` or `unique` is set.
    let cmp = if global_settings.mode == SortMode::Random
        || global_settings.stable
        || global_settings.unique
    {
        Ordering::Equal
    } else {
        a.line.cmp(b.line)
    };

    if global_settings.reverse {
        cmp.reverse()
    } else {
        cmp
    }
}
1724
1725// This function cleans up the initial comparison done by leading_num_common for a general numeric compare.
1726// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
1727// scientific notation, so we strip those lines only after the end of the following numeric string.
1728// For example, 5e10KFD would be 5e10 or 5x10^10 and +10000HFKJFK would become 10000.
1729#[allow(clippy::cognitive_complexity)]
1730fn get_leading_gen(inp: &[u8]) -> Range<usize> {
1731    let trimmed = inp.trim_ascii_start();
1732    let leading_whitespace_len = inp.len() - trimmed.len();
1733
1734    // check for inf, -inf and nan
1735    const ALLOWED_PREFIXES: &[&[u8]] = &[b"inf", b"-inf", b"nan"];
1736    for &allowed_prefix in ALLOWED_PREFIXES {
1737        if trimmed.len() >= allowed_prefix.len()
1738            && trimmed[..allowed_prefix.len()].eq_ignore_ascii_case(allowed_prefix)
1739        {
1740            return leading_whitespace_len..(leading_whitespace_len + allowed_prefix.len());
1741        }
1742    }
1743    // Make this iter peekable to see if next char is numeric
1744    let mut char_indices = itertools::peek_nth(trimmed.iter().enumerate());
1745
1746    let first = char_indices.peek();
1747
1748    if matches!(first, Some((_, NEGATIVE | POSITIVE))) {
1749        char_indices.next();
1750    }
1751
1752    let mut had_e_notation = false;
1753    let mut had_decimal_pt = false;
1754    let mut had_hex_notation: bool = false;
1755    while let Some((idx, &c)) = char_indices.next() {
1756        if had_hex_notation && c.is_ascii_hexdigit() {
1757            continue;
1758        }
1759
1760        if c.is_ascii_digit() {
1761            if c == b'0' && matches!(char_indices.peek(), Some((_, b'x' | b'X'))) {
1762                had_hex_notation = true;
1763                char_indices.next();
1764            }
1765            continue;
1766        }
1767
1768        if c == DECIMAL_PT && !had_decimal_pt && !had_e_notation {
1769            had_decimal_pt = true;
1770            continue;
1771        }
1772        let is_decimal_e = (c == b'e' || c == b'E') && !had_hex_notation;
1773        let is_hex_e = (c == b'p' || c == b'P') && had_hex_notation;
1774        if (is_decimal_e || is_hex_e) && !had_e_notation {
1775            // we can only consume the 'e' if what follow is either a digit, or a sign followed by a digit.
1776            if let Some(&(_, &next_char)) = char_indices.peek() {
1777                if (next_char == b'+' || next_char == b'-')
1778                    && matches!(
1779                        char_indices.peek_nth(2),
1780                        Some((_, c)) if c.is_ascii_digit()
1781                    )
1782                {
1783                    // Consume the sign. The following digits will be consumed by the main loop.
1784                    char_indices.next();
1785                    had_e_notation = true;
1786                    continue;
1787                }
1788                if next_char.is_ascii_digit() {
1789                    had_e_notation = true;
1790                    continue;
1791                }
1792            }
1793        }
1794        return leading_whitespace_len..(leading_whitespace_len + idx);
1795    }
1796    leading_whitespace_len..inp.len()
1797}
1798
/// Outcome of parsing the leading general-numeric token of a sort key.
///
/// The variants are declared in ascending sort order: the derived [`PartialOrd`] ranks
/// `Invalid` lowest, followed by `Nan`, `MinusInfinity`, any `Number` and finally
/// `Infinity`. Do not reorder them without intending to change the sort order.
#[derive(Clone, PartialEq, PartialOrd, Debug)]
pub enum GeneralBigDecimalParseResult {
    /// The input could not be parsed as a number at all.
    Invalid,
    /// The input parsed as NaN (of either sign).
    Nan,
    /// The input parsed as negative infinity.
    MinusInfinity,
    /// The input parsed as a finite number.
    Number(BigDecimal),
    /// The input parsed as positive infinity.
    Infinity,
}
1807
1808/// Parse the beginning string into a [`GeneralBigDecimalParseResult`].
1809/// Using a [`GeneralBigDecimalParseResult`] instead of [`ExtendedBigDecimal`] is necessary to correctly order floats.
1810#[inline(always)]
1811fn general_bd_parse(a: &[u8]) -> GeneralBigDecimalParseResult {
1812    // The string should be valid ASCII to be parsed.
1813    let Ok(a) = std::str::from_utf8(a) else {
1814        return GeneralBigDecimalParseResult::Invalid;
1815    };
1816
1817    // Parse digits, and fold in recoverable errors
1818    let ebd = match ExtendedBigDecimal::extended_parse(a) {
1819        Err(ExtendedParserError::NotNumeric) => return GeneralBigDecimalParseResult::Invalid,
1820        Err(
1821            ExtendedParserError::PartialMatch(ebd, _)
1822            | ExtendedParserError::Overflow(ebd)
1823            | ExtendedParserError::Underflow(ebd),
1824        )
1825        | Ok(ebd) => ebd,
1826    };
1827
1828    match ebd {
1829        ExtendedBigDecimal::BigDecimal(bd) => GeneralBigDecimalParseResult::Number(bd),
1830        ExtendedBigDecimal::Infinity => GeneralBigDecimalParseResult::Infinity,
1831        ExtendedBigDecimal::MinusInfinity => GeneralBigDecimalParseResult::MinusInfinity,
1832        // Minus zero and zero are equal
1833        ExtendedBigDecimal::MinusZero => GeneralBigDecimalParseResult::Number(0.into()),
1834        ExtendedBigDecimal::Nan | ExtendedBigDecimal::MinusNan => GeneralBigDecimalParseResult::Nan,
1835    }
1836}
1837
1838/// Compares two floats, with errors and non-numerics assumed to be -inf.
1839/// Stops coercing at the first non-numeric char.
1840/// We explicitly need to convert to f64 in this case.
1841fn general_numeric_compare(
1842    a: &GeneralBigDecimalParseResult,
1843    b: &GeneralBigDecimalParseResult,
1844) -> Ordering {
1845    a.partial_cmp(b).unwrap()
1846}
1847
1848fn get_rand_string() -> [u8; 16] {
1849    rng().sample(rand::distr::StandardUniform)
1850}
1851
1852fn get_hash<T: Hash>(t: &T) -> u64 {
1853    let mut s = FnvHasher::default();
1854    t.hash(&mut s);
1855    s.finish()
1856}
1857
1858fn random_shuffle(a: &[u8], b: &[u8], salt: &[u8]) -> Ordering {
1859    let da = get_hash(&(a, salt));
1860    let db = get_hash(&(b, salt));
1861    da.cmp(&db)
1862}
1863
/// A month as recognized by month sort.
///
/// The variants are declared in ascending order for the derived [`Ord`]: `Unknown`
/// sorts before `January`, and the months follow in calendar order. Do not reorder
/// them without intending to change the sort order.
#[derive(Eq, Ord, PartialEq, PartialOrd, Clone, Copy)]
enum Month {
    /// No month abbreviation was recognized.
    Unknown,
    January,
    February,
    March,
    April,
    May,
    June,
    July,
    August,
    September,
    October,
    November,
    December,
}
1880
1881/// Parse the beginning string into a Month, returning [`Month::Unknown`] on errors.
1882fn month_parse(line: &[u8]) -> Month {
1883    let line = line.trim_ascii_start();
1884
1885    match line.get(..3).map(|x| x.to_ascii_uppercase()).as_deref() {
1886        Some(b"JAN") => Month::January,
1887        Some(b"FEB") => Month::February,
1888        Some(b"MAR") => Month::March,
1889        Some(b"APR") => Month::April,
1890        Some(b"MAY") => Month::May,
1891        Some(b"JUN") => Month::June,
1892        Some(b"JUL") => Month::July,
1893        Some(b"AUG") => Month::August,
1894        Some(b"SEP") => Month::September,
1895        Some(b"OCT") => Month::October,
1896        Some(b"NOV") => Month::November,
1897        Some(b"DEC") => Month::December,
1898        _ => Month::Unknown,
1899    }
1900}
1901
1902fn month_compare(a: &[u8], b: &[u8]) -> Ordering {
1903    let ma = month_parse(a);
1904    let mb = month_parse(b);
1905
1906    ma.cmp(&mb)
1907}
1908
1909fn print_sorted<'a, T: Iterator<Item = &'a Line<'a>>>(
1910    iter: T,
1911    settings: &GlobalSettings,
1912    output: Output,
1913) -> UResult<()> {
1914    let output_name = output
1915        .as_output_name()
1916        .unwrap_or(OsStr::new("standard output"))
1917        .to_owned();
1918    let ctx = || translate!("sort-error-write-failed", "output" => output_name.maybe_quote());
1919
1920    let mut writer = output.into_write();
1921    for line in iter {
1922        line.print(&mut writer, settings).map_err_context(ctx)?;
1923    }
1924    writer.flush().map_err_context(ctx)?;
1925    Ok(())
1926}
1927
1928fn open(path: impl AsRef<OsStr>) -> UResult<Box<dyn Read + Send>> {
1929    let path = path.as_ref();
1930    if path == STDIN_FILE {
1931        let stdin = stdin();
1932        return Ok(Box::new(stdin) as Box<dyn Read + Send>);
1933    }
1934
1935    let path = Path::new(path);
1936    match File::open(path) {
1937        Ok(f) => Ok(Box::new(f) as Box<dyn Read + Send>),
1938        Err(error) => Err(SortError::ReadFailed {
1939            path: path.to_owned(),
1940            error,
1941        }
1942        .into()),
1943    }
1944}
1945
1946fn open_with_open_failed_error(path: impl AsRef<OsStr>) -> UResult<Box<dyn Read + Send>> {
1947    // On error, returns an OpenFailed error instead of a ReadFailed error
1948    let path = path.as_ref();
1949    if path == STDIN_FILE {
1950        let stdin = stdin();
1951        return Ok(Box::new(stdin) as Box<dyn Read + Send>);
1952    }
1953
1954    let path = Path::new(path);
1955    match File::open(path) {
1956        Ok(f) => Ok(Box::new(f) as Box<dyn Read + Send>),
1957        Err(error) => Err(SortError::OpenFailed {
1958            path: path.to_owned(),
1959            error,
1960        }
1961        .into()),
1962    }
1963}
1964
/// Builds the translated error message for an option argument whose size failed to parse.
///
/// `s` is the offending argument (inserted quoted) and `option` is the flag it was passed
/// to; `error` selects which message is produced: invalid suffix, invalid argument, or
/// argument too large.
fn format_error_message(error: &ParseSizeError, s: &str, option: &str) -> String {
    // NOTE:
    // GNU's sort echoes the affected flag, -S or --buffer-size, depending on the user's selection
    match error {
        ParseSizeError::InvalidSuffix(_) => {
            translate!("sort-invalid-suffix-in-option-arg", "option" => option, "arg" => s.quote())
        }
        // Parse failures and physical-memory errors share the generic "invalid argument" message.
        ParseSizeError::ParseFailure(_) | ParseSizeError::PhysicalMem(_) => {
            translate!("sort-invalid-option-arg", "option" => option, "arg" => s.quote())
        }
        ParseSizeError::SizeTooBig(_) => {
            translate!("sort-option-arg-too-large", "option" => option, "arg" => s.quote())
        }
    }
}
1980
// Unit tests for the helpers in this file: hashing, the random/month/version comparisons,
// field tokenization, the size of `Line`, and byte-count parsing.
#[cfg(test)]
mod tests {

    use super::*;

    /// Runs `tokenize` into a fresh buffer and returns the resulting fields.
    fn tokenize_helper(line: &[u8], separator: Option<u8>) -> Vec<Field> {
        let mut buffer = vec![];
        tokenize(line, separator, &mut buffer);
        buffer
    }

    // Pins the hash of a known input so changes to the hashing scheme are noticed.
    #[test]
    fn test_get_hash() {
        let a = "Ted".to_string();

        assert_eq!(2_646_829_031_758_483_623, get_hash(&a));
    }

    // Identical keys must compare equal regardless of the (random) salt.
    #[test]
    fn test_random_shuffle() {
        let a = b"Ted";
        let b = b"Ted";
        let c = get_rand_string();

        assert_eq!(Ordering::Equal, random_shuffle(a, b, &c));
    }

    // Month abbreviations are matched case-insensitively.
    #[test]
    fn test_month_compare() {
        let a = b"JaN";
        let b = b"OCt";

        assert_eq!(Ordering::Less, month_compare(a, b));
    }
    #[test]
    fn test_version_compare() {
        let a = b"1.2.3-alpha2";
        let b = b"1.4.0";

        assert_eq!(Ordering::Less, version_cmp(a, b));
    }

    // Same property as `test_random_shuffle`, with a numeric-looking key.
    #[test]
    fn test_random_compare() {
        let a = b"9";
        let b = b"9";
        let c = get_rand_string();

        assert_eq!(Ordering::Equal, random_shuffle(a, b, &c));
    }

    // Without a separator, the whitespace preceding a field is part of that field.
    #[test]
    fn test_tokenize_fields() {
        let line = b"foo bar b    x";
        assert_eq!(tokenize_helper(line, None), vec![0..3, 3..7, 7..9, 9..14]);
    }

    // Leading whitespace belongs to the first field.
    #[test]
    fn test_tokenize_fields_leading_whitespace() {
        let line = b"    foo bar b    x";
        assert_eq!(
            tokenize_helper(line, None),
            vec![0..7, 7..11, 11..13, 13..18]
        );
    }

    // With a custom separator, consecutive separators produce empty fields and the
    // separator itself is excluded from the fields.
    #[test]
    fn test_tokenize_fields_custom_separator() {
        let line = b"aaa foo bar b    x";
        assert_eq!(
            tokenize_helper(line, Some(b'a')),
            vec![0..0, 1..1, 2..2, 3..9, 10..18]
        );
    }

    // A trailing separator does not start an additional empty field.
    #[test]
    fn test_tokenize_fields_trailing_custom_separator() {
        let line = b"a";
        assert_eq!(tokenize_helper(line, Some(b'a')), vec![0..0]);
        let line = b"aa";
        assert_eq!(tokenize_helper(line, Some(b'a')), vec![0..0, 1..1]);
        let line = b"..a..a";
        assert_eq!(tokenize_helper(line, Some(b'a')), vec![0..2, 3..5]);
    }

    #[test]
    #[cfg(target_pointer_width = "64")]
    fn test_line_size() {
        // We should make sure to not regress the size of the Line struct because
        // it is unconditional overhead for every line we sort.
        assert_eq!(size_of::<Line>(), 24);
    }

    // Covers accepted size suffixes as well as inputs that must be rejected.
    #[test]
    fn test_parse_byte_count() {
        let valid_input = [
            ("0", 0),
            ("50K", 50 * 1024),
            ("50k", 50 * 1024),
            ("1M", 1024 * 1024),
            ("100M", 100 * 1024 * 1024),
            #[cfg(not(target_pointer_width = "32"))]
            ("1000G", 1000 * 1024 * 1024 * 1024),
            #[cfg(not(target_pointer_width = "32"))]
            ("10T", 10 * 1024 * 1024 * 1024 * 1024),
            ("1b", 1),
            ("1024b", 1024),
            ("1024Mb", 1024 * 1024 * 1024), // NOTE: This might not be how GNU `sort` behaves for 'Mb'
            ("1", 1024),                    // K is default
            ("50", 50 * 1024),
            ("K", 1024),
            ("k", 1024),
            ("m", 1024 * 1024),
            #[cfg(not(target_pointer_width = "32"))]
            ("E", 1024 * 1024 * 1024 * 1024 * 1024 * 1024),
        ];
        for (input, expected_output) in &valid_input {
            assert_eq!(
                GlobalSettings::parse_byte_count(input),
                Ok(*expected_output)
            );
        }

        // SizeTooBig
        let invalid_input = ["500E", "1Y"];
        for input in &invalid_input {
            assert!(GlobalSettings::parse_byte_count(input).is_err());
        }

        // ParseFailure
        let invalid_input = ["nonsense", "1B", "B", "b", "p", "e", "z", "y"];
        for input in &invalid_input {
            assert!(GlobalSettings::parse_byte_count(input).is_err());
        }
    }
}