// uu_join/join.rs

// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck memmem
7
8use clap::builder::ValueParser;
9use clap::{Arg, ArgAction, Command};
10use memchr::{Memchr3, memchr_iter, memmem::Finder};
11use std::cmp::Ordering;
12use std::ffi::OsString;
13use std::fs::File;
14use std::io::{BufRead, BufReader, BufWriter, Split, Stdin, Write, stdin, stdout};
15use std::num::IntErrorKind;
16#[cfg(unix)]
17use std::os::unix::ffi::OsStrExt;
18use thiserror::Error;
19use uucore::display::Quotable;
20use uucore::error::{FromIo, UError, UResult, USimpleError, set_exit_code};
21use uucore::format_usage;
22use uucore::i18n::collator::{
23    AlternateHandling, CollatorOptions, locale_cmp, should_use_locale_collation, try_init_collator,
24};
25use uucore::line_ending::LineEnding;
26use uucore::translate;
27
28#[derive(Debug, Error)]
29enum JoinError {
30    #[error("{}", translate!("join-error-io", "error" => .0))]
31    IOError(#[from] std::io::Error),
32
33    #[error("{0}")]
34    UnorderedInput(String),
35}
36
37// If you still need the UError implementation for compatibility:
38impl UError for JoinError {
39    fn code(&self) -> i32 {
40        1
41    }
42}
43
/// Identifies which of the two input files something belongs to.
#[derive(Copy, Clone, PartialEq)]
enum FileNum {
    /// The first input file.
    File1,
    /// The second input file.
    File2,
}
49
/// The field separator as selected on the command line.
#[derive(Clone)]
enum SepSetting {
    /// A separator that is exactly one byte.
    Byte(u8),
    /// A single character whose encoding is longer than one byte.
    Char(Vec<u8>),
    /// No separator at all: the whole line is the join field.
    Line,
    /// Runs of whitespace separate fields.
    Whitespaces,
}
61
/// Strategy for splitting a line into fields and for joining fields on output.
trait Separator: Clone {
    /// Split `haystack` into fields, returning each field as a `(start, end)` byte range
    /// (end exclusive). `len_guess` is a capacity hint for the returned vector.
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)>;
    /// The bytes written between fields in the output.
    fn output_separator(&self) -> &[u8];
}
68
69/// Simple separators one byte in length.
70#[derive(Copy, Clone)]
71struct OneByteSep {
72    byte: [u8; 1],
73}
74
75impl Separator for OneByteSep {
76    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
77        let mut field_ranges = Vec::with_capacity(len_guess);
78        let mut last_end = 0;
79
80        for i in memchr_iter(self.byte[0], haystack) {
81            field_ranges.push((last_end, i));
82            last_end = i + 1;
83        }
84        field_ranges.push((last_end, haystack.len()));
85        field_ranges
86    }
87
88    fn output_separator(&self) -> &[u8] {
89        &self.byte
90    }
91}
92
93/// Multi-byte (but still single character) separators.
94#[derive(Clone)]
95struct MultiByteSep<'a> {
96    finder: Finder<'a>,
97}
98
99impl Separator for MultiByteSep<'_> {
100    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
101        let mut field_ranges = Vec::with_capacity(len_guess);
102        let mut last_end = 0;
103
104        for i in self.finder.find_iter(haystack) {
105            field_ranges.push((last_end, i));
106            last_end = i + self.finder.needle().len();
107        }
108        field_ranges.push((last_end, haystack.len()));
109        field_ranges
110    }
111
112    fn output_separator(&self) -> &[u8] {
113        self.finder.needle()
114    }
115}
116
117/// Whole-line separator.
118#[derive(Copy, Clone)]
119struct LineSep {}
120
121impl Separator for LineSep {
122    fn field_ranges(&self, haystack: &[u8], _len_guess: usize) -> Vec<(usize, usize)> {
123        vec![(0, haystack.len())]
124    }
125
126    fn output_separator(&self) -> &[u8] {
127        &[]
128    }
129}
130
131/// Default whitespace separator.
132#[derive(Copy, Clone)]
133struct WhitespaceSep {}
134
135impl Separator for WhitespaceSep {
136    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
137        let mut field_ranges = Vec::with_capacity(len_guess);
138        let mut last_end = 0;
139
140        // GNU join used Bourne shell field splitters by default
141        // FIXME: but now uses locale-dependent whitespace
142        for i in Memchr3::new(b' ', b'\t', b'\n', haystack) {
143            // leading whitespace should be dropped, contiguous whitespace merged
144            if i > last_end {
145                field_ranges.push((last_end, i));
146            }
147            last_end = i + 1;
148        }
149        field_ranges.push((last_end, haystack.len()));
150        field_ranges
151    }
152
153    fn output_separator(&self) -> &[u8] {
154        b" "
155    }
156}
157
/// How strictly the sort order of the inputs is verified while reading.
#[derive(Copy, Clone, PartialEq)]
enum CheckOrder {
    /// Neither order flag was given.
    Default,
    /// `--nocheck-order`: lines are never compared against their predecessor.
    Disabled,
    /// `--check-order`: an out-of-order line is a fatal error.
    Enabled,
}
164
165struct Settings {
166    key1: usize,
167    key2: usize,
168    print_unpaired1: bool,
169    print_unpaired2: bool,
170    print_joined: bool,
171    ignore_case: bool,
172    line_ending: LineEnding,
173    separator: SepSetting,
174    autoformat: bool,
175    format: Vec<Spec>,
176    empty: Vec<u8>,
177    check_order: CheckOrder,
178    headers: bool,
179}
180
181impl Default for Settings {
182    fn default() -> Self {
183        Self {
184            key1: 0,
185            key2: 0,
186            print_unpaired1: false,
187            print_unpaired2: false,
188            print_joined: true,
189            ignore_case: false,
190            line_ending: LineEnding::Newline,
191            separator: SepSetting::Whitespaces,
192            autoformat: false,
193            format: vec![],
194            empty: vec![],
195            check_order: CheckOrder::Default,
196            headers: false,
197        }
198    }
199}
200
201/// Output representation.
202struct Repr<'a, Sep: Separator> {
203    line_ending: LineEnding,
204    separator: Sep,
205    format: Vec<Spec>,
206    empty: &'a [u8],
207}
208
209impl<'a, Sep: Separator> Repr<'a, Sep> {
210    fn new(line_ending: LineEnding, separator: Sep, format: Vec<Spec>, empty: &'a [u8]) -> Self {
211        Repr {
212            line_ending,
213            separator,
214            format,
215            empty,
216        }
217    }
218
219    fn uses_format(&self) -> bool {
220        !self.format.is_empty()
221    }
222
223    /// Write the field or empty filler if the field is not set.
224    fn write_field(
225        &self,
226        writer: &mut impl Write,
227        field: Option<&[u8]>,
228    ) -> Result<(), std::io::Error> {
229        let value = match field {
230            Some(field) => field,
231            None => self.empty,
232        };
233
234        writer.write_all(value)
235    }
236
237    /// Write each field except the one at the index.
238    fn write_fields(
239        &self,
240        writer: &mut impl Write,
241        line: &Line,
242        index: usize,
243    ) -> Result<(), std::io::Error> {
244        for i in 0..line.field_ranges.len() {
245            if i != index {
246                writer.write_all(self.separator.output_separator())?;
247                writer.write_all(line.get_field(i).unwrap())?;
248            }
249        }
250        Ok(())
251    }
252
253    /// Write each field or the empty filler if the field is not set.
254    fn write_format<F>(&self, writer: &mut impl Write, f: F) -> Result<(), std::io::Error>
255    where
256        F: Fn(&Spec) -> Option<&'a [u8]>,
257    {
258        for i in 0..self.format.len() {
259            if i > 0 {
260                writer.write_all(self.separator.output_separator())?;
261            }
262
263            let field = match f(&self.format[i]) {
264                Some(value) => value,
265                None => self.empty,
266            };
267
268            writer.write_all(field)?;
269        }
270        Ok(())
271    }
272
273    fn write_line_ending(&self, writer: &mut impl Write) -> Result<(), std::io::Error> {
274        writer.write_all(&[self.line_ending as u8])
275    }
276}
277
/// Borrowed byte string that compares and orders without regard to ASCII case.
#[derive(Eq)]
struct CaseInsensitiveSlice<'a> {
    v: &'a [u8],
}

impl Ord for CaseInsensitiveSlice<'_> {
    /// Lexicographic comparison of the ASCII-lowercased bytes; when one side is
    /// a prefix of the other, the shorter one sorts first.
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs = self.v.iter().map(u8::to_ascii_lowercase);
        let rhs = other.v.iter().map(u8::to_ascii_lowercase);
        lhs.cmp(rhs)
    }
}

impl PartialOrd for CaseInsensitiveSlice<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for CaseInsensitiveSlice<'_> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
311
312/// Input processing parameters.
313struct Input<Sep: Separator> {
314    separator: Sep,
315    ignore_case: bool,
316    check_order: CheckOrder,
317    use_locale: bool,
318}
319
320impl<Sep: Separator> Input<Sep> {
321    fn new(separator: Sep, ignore_case: bool, check_order: CheckOrder, use_locale: bool) -> Self {
322        Self {
323            separator,
324            ignore_case,
325            check_order,
326            use_locale,
327        }
328    }
329
330    fn compare(&self, field1: Option<&[u8]>, field2: Option<&[u8]>) -> Ordering {
331        if let (Some(field1), Some(field2)) = (field1, field2) {
332            if self.ignore_case {
333                let field1 = CaseInsensitiveSlice { v: field1 };
334                let field2 = CaseInsensitiveSlice { v: field2 };
335                field1.cmp(&field2)
336            } else if self.use_locale {
337                locale_cmp(field1, field2)
338            } else {
339                field1.cmp(field2)
340            }
341        } else {
342            match field1 {
343                Some(_) => Ordering::Greater,
344                None => match field2 {
345                    Some(_) => Ordering::Less,
346                    None => Ordering::Equal,
347                },
348            }
349        }
350    }
351}
352
353enum Spec {
354    Key,
355    Field(FileNum, usize),
356}
357
358impl Spec {
359    fn parse(format: &str) -> UResult<Self> {
360        let mut chars = format.chars();
361
362        let file_num = match chars.next() {
363            Some('0') => {
364                // Must be all alone without a field specifier.
365                if chars.next().is_none() {
366                    return Ok(Self::Key);
367                }
368                return Err(USimpleError::new(
369                    1,
370                    translate!("join-error-invalid-field-specifier", "spec" => format.quote()),
371                ));
372            }
373            Some('1') => FileNum::File1,
374            Some('2') => FileNum::File2,
375            _ => {
376                return Err(USimpleError::new(
377                    1,
378                    translate!("join-error-invalid-file-number", "spec" => format.quote()),
379                ));
380            }
381        };
382
383        if let Some('.') = chars.next() {
384            return Ok(Self::Field(file_num, parse_field_number(chars.as_str())?));
385        }
386
387        Err(USimpleError::new(
388            1,
389            translate!("join-error-invalid-field-specifier", "spec" => format.quote()),
390        ))
391    }
392}
393
394struct Line {
395    field_ranges: Vec<(usize, usize)>,
396    string: Vec<u8>,
397}
398
399impl Line {
400    fn new<Sep: Separator>(string: Vec<u8>, separator: &Sep, len_guess: usize) -> Self {
401        let field_ranges = separator.field_ranges(&string, len_guess);
402
403        Self {
404            field_ranges,
405            string,
406        }
407    }
408
409    /// Get field at index.
410    fn get_field(&self, index: usize) -> Option<&[u8]> {
411        if index < self.field_ranges.len() {
412            let (low, high) = self.field_ranges[index];
413            Some(&self.string[low..high])
414        } else {
415            None
416        }
417    }
418}
419
420struct State<'a> {
421    key: usize,
422    file_name: &'a OsString,
423    file_num: FileNum,
424    print_unpaired: bool,
425    lines: Split<Box<dyn BufRead + 'a>>,
426    max_len: usize,
427    seq: Vec<Line>,
428    line_num: usize,
429    has_failed: bool,
430    has_unpaired: bool,
431}
432
433impl<'a> State<'a> {
434    fn new(
435        file_num: FileNum,
436        name: &'a OsString,
437        stdin: &'a Stdin,
438        key: usize,
439        line_ending: LineEnding,
440        print_unpaired: bool,
441    ) -> UResult<Self> {
442        let file_buf = if name == "-" {
443            Box::new(stdin.lock()) as Box<dyn BufRead>
444        } else {
445            let file = File::open(name).map_err_context(|| format!("{}", name.maybe_quote()))?;
446            Box::new(BufReader::new(file)) as Box<dyn BufRead>
447        };
448
449        Ok(State {
450            key,
451            file_name: name,
452            file_num,
453            print_unpaired,
454            lines: file_buf.split(line_ending as u8),
455            max_len: 1,
456            seq: Vec::new(),
457            line_num: 0,
458            has_failed: false,
459            has_unpaired: false,
460        })
461    }
462
463    /// Skip the current unpaired line.
464    fn skip_line<Sep: Separator>(
465        &mut self,
466        writer: &mut impl Write,
467        input: &Input<Sep>,
468        repr: &Repr<'a, Sep>,
469    ) -> UResult<()> {
470        if self.print_unpaired {
471            self.write_first_line(writer, repr)?;
472        }
473
474        self.reset_next_line(input)?;
475        Ok(())
476    }
477
478    /// Keep reading line sequence until the key does not change, return
479    /// the first line whose key differs.
480    fn extend<Sep: Separator>(&mut self, input: &Input<Sep>) -> UResult<Option<Line>> {
481        while let Some(line) = self.next_line(input)? {
482            let diff = input.compare(self.get_current_key(), line.get_field(self.key));
483
484            if diff == Ordering::Equal {
485                self.seq.push(line);
486            } else {
487                return Ok(Some(line));
488            }
489        }
490
491        Ok(None)
492    }
493
494    /// Write lines in the buffers as headers.
495    fn write_headers<Sep: Separator>(
496        &self,
497        writer: &mut impl Write,
498        other: &State,
499        repr: &Repr<'a, Sep>,
500    ) -> Result<(), std::io::Error> {
501        if self.has_line() {
502            if other.has_line() {
503                self.combine(writer, other, repr)?;
504            } else {
505                self.write_first_line(writer, repr)?;
506            }
507        } else if other.has_line() {
508            other.write_first_line(writer, repr)?;
509        }
510
511        Ok(())
512    }
513
514    /// Combine two line sequences.
515    fn combine<Sep: Separator>(
516        &self,
517        writer: &mut impl Write,
518        other: &State,
519        repr: &Repr<'a, Sep>,
520    ) -> Result<(), std::io::Error> {
521        let key = self.get_current_key();
522
523        for line1 in &self.seq {
524            for line2 in &other.seq {
525                if repr.uses_format() {
526                    repr.write_format(writer, |spec| match *spec {
527                        Spec::Key => key,
528                        Spec::Field(file_num, field_num) => {
529                            if file_num == self.file_num {
530                                return line1.get_field(field_num);
531                            }
532
533                            if file_num == other.file_num {
534                                return line2.get_field(field_num);
535                            }
536
537                            None
538                        }
539                    })?;
540                } else {
541                    repr.write_field(writer, key)?;
542                    repr.write_fields(writer, line1, self.key)?;
543                    repr.write_fields(writer, line2, other.key)?;
544                }
545
546                repr.write_line_ending(writer)?;
547            }
548        }
549
550        Ok(())
551    }
552
553    /// Reset with the next line.
554    fn reset(&mut self, next_line: Option<Line>) {
555        self.seq.clear();
556
557        if let Some(line) = next_line {
558            self.seq.push(line);
559        }
560    }
561
562    fn reset_read_line<Sep: Separator>(
563        &mut self,
564        input: &Input<Sep>,
565    ) -> Result<(), std::io::Error> {
566        let line = self.read_line(&input.separator)?;
567        self.reset(line);
568        Ok(())
569    }
570
571    fn reset_next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<(), JoinError> {
572        let line = self.next_line(input)?;
573        self.reset(line);
574        Ok(())
575    }
576
577    fn has_line(&self) -> bool {
578        !self.seq.is_empty()
579    }
580
581    fn initialize<Sep: Separator>(
582        &mut self,
583        read_sep: &Sep,
584        autoformat: bool,
585    ) -> std::io::Result<usize> {
586        if let Some(line) = self.read_line(read_sep)? {
587            self.seq.push(line);
588
589            if autoformat {
590                return Ok(self.seq[0].field_ranges.len());
591            }
592        }
593        Ok(0)
594    }
595
596    fn finalize<Sep: Separator>(
597        &mut self,
598        writer: &mut impl Write,
599        input: &Input<Sep>,
600        repr: &Repr<'a, Sep>,
601    ) -> UResult<()> {
602        if self.has_line() {
603            if self.print_unpaired {
604                self.write_first_line(writer, repr)?;
605            }
606
607            let mut next_line = self.next_line(input)?;
608            while let Some(line) = &next_line {
609                if self.print_unpaired {
610                    self.write_line(writer, line, repr)?;
611                }
612                self.reset(next_line);
613                next_line = self.next_line(input)?;
614            }
615        }
616
617        Ok(())
618    }
619
620    /// Get the next line without the order check.
621    fn read_line<Sep: Separator>(&mut self, sep: &Sep) -> Result<Option<Line>, std::io::Error> {
622        match self.lines.next() {
623            Some(value) => {
624                self.line_num += 1;
625                let line = Line::new(value?, sep, self.max_len);
626                if line.field_ranges.len() > self.max_len {
627                    self.max_len = line.field_ranges.len();
628                }
629                Ok(Some(line))
630            }
631            None => Ok(None),
632        }
633    }
634
635    /// Get the next line with the order check.
636    fn next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<Option<Line>, JoinError> {
637        if let Some(line) = self.read_line(&input.separator)? {
638            if input.check_order == CheckOrder::Disabled {
639                return Ok(Some(line));
640            }
641
642            let diff = input.compare(self.get_current_key(), line.get_field(self.key));
643
644            if diff == Ordering::Greater
645                && (input.check_order == CheckOrder::Enabled
646                    || (self.has_unpaired && !self.has_failed))
647            {
648                let err_msg = translate!("join-error-not-sorted", "file" => self.file_name.maybe_quote(), "line_num" => self.line_num, "content" => String::from_utf8_lossy(&line.string));
649                // This is fatal if the check is enabled.
650                if input.check_order == CheckOrder::Enabled {
651                    return Err(JoinError::UnorderedInput(err_msg));
652                }
653                eprintln!("{}: {err_msg}", uucore::execution_phrase());
654                self.has_failed = true;
655            }
656
657            Ok(Some(line))
658        } else {
659            Ok(None)
660        }
661    }
662
663    /// Gets the key value of the lines stored in seq.
664    fn get_current_key(&self) -> Option<&[u8]> {
665        self.seq[0].get_field(self.key)
666    }
667
668    fn write_line<Sep: Separator>(
669        &self,
670        writer: &mut impl Write,
671        line: &Line,
672        repr: &Repr<'a, Sep>,
673    ) -> Result<(), std::io::Error> {
674        if repr.uses_format() {
675            repr.write_format(writer, |spec| match *spec {
676                Spec::Key => line.get_field(self.key),
677                Spec::Field(file_num, field_num) => {
678                    if file_num == self.file_num {
679                        line.get_field(field_num)
680                    } else {
681                        None
682                    }
683                }
684            })?;
685        } else {
686            repr.write_field(writer, line.get_field(self.key))?;
687            repr.write_fields(writer, line, self.key)?;
688        }
689
690        repr.write_line_ending(writer)
691    }
692
693    fn write_first_line<Sep: Separator>(
694        &self,
695        writer: &mut impl Write,
696        repr: &Repr<'a, Sep>,
697    ) -> Result<(), std::io::Error> {
698        self.write_line(writer, &self.seq[0], repr)
699    }
700}
701
702fn parse_separator(value_os: &OsString) -> UResult<SepSetting> {
703    // Five possible separator values:
704    // No argument supplied, separate on whitespace; handled implicitly as the default elsewhere
705    // An empty string arg, whole line separation
706    // On unix-likes only, a single arbitrary byte
707    // The two-character "\0" string, interpreted as a single 0 byte
708    // A single scalar valid in the locale encoding (currently only UTF-8)
709
710    if value_os.is_empty() {
711        return Ok(SepSetting::Line);
712    }
713
714    #[cfg(unix)]
715    {
716        let value = value_os.as_bytes();
717        if value.len() == 1 {
718            return Ok(SepSetting::Byte(value[0]));
719        }
720    }
721
722    let Some(value) = value_os.to_str() else {
723        #[cfg(unix)]
724        return Err(USimpleError::new(1, translate!("join-error-non-utf8-tab")));
725        #[cfg(not(unix))]
726        return Err(USimpleError::new(
727            1,
728            translate!("join-error-unprintable-separators"),
729        ));
730    };
731
732    let mut chars = value.chars();
733    let c = chars.next().expect("valid string with at least one byte");
734    match chars.next() {
735        None => Ok(SepSetting::Char(value.into())),
736        Some('0') if c == '\\' => Ok(SepSetting::Byte(0)),
737        _ => Err(USimpleError::new(
738            1,
739            translate!("join-error-multi-character-tab", "value" => value),
740        )),
741    }
742}
743
744fn parse_print_settings(matches: &clap::ArgMatches) -> UResult<(bool, bool, bool)> {
745    let mut print_joined = true;
746    let mut print_unpaired1 = false;
747    let mut print_unpaired2 = false;
748
749    let v_values = matches.get_many::<String>("v");
750    if v_values.is_some() {
751        print_joined = false;
752    }
753
754    let unpaired = v_values
755        .unwrap_or_default()
756        .chain(matches.get_many("a").unwrap_or_default());
757    for file_num in unpaired {
758        match parse_file_number(file_num)? {
759            FileNum::File1 => print_unpaired1 = true,
760            FileNum::File2 => print_unpaired2 = true,
761        }
762    }
763
764    Ok((print_joined, print_unpaired1, print_unpaired2))
765}
766
767fn get_and_parse_field_number(matches: &clap::ArgMatches, key: &str) -> UResult<Option<usize>> {
768    let value = matches.get_one::<String>(key).map(|s| s.as_str());
769    parse_field_number_option(value)
770}
771
772/// Parses the command-line arguments and constructs a `Settings` struct.
773///
774/// This function takes the matches from the command-line arguments, processes them,
775/// and returns a `Settings` struct that encapsulates the configuration for the program.
776#[allow(clippy::field_reassign_with_default)]
777fn parse_settings(matches: &clap::ArgMatches) -> UResult<Settings> {
778    let keys = get_and_parse_field_number(matches, "j")?;
779    let key1 = get_and_parse_field_number(matches, "1")?;
780    let key2 = get_and_parse_field_number(matches, "2")?;
781
782    let (print_joined, print_unpaired1, print_unpaired2) = parse_print_settings(matches)?;
783
784    let mut settings = Settings::default();
785
786    settings.print_joined = print_joined;
787    settings.print_unpaired1 = print_unpaired1;
788    settings.print_unpaired2 = print_unpaired2;
789
790    settings.ignore_case = matches.get_flag("i");
791    settings.key1 = get_field_number(keys, key1)?;
792    settings.key2 = get_field_number(keys, key2)?;
793    if let Some(value_os) = matches.get_one::<OsString>("t") {
794        settings.separator = parse_separator(value_os)?;
795    }
796    if let Some(format) = matches.get_one::<String>("o") {
797        if format == "auto" {
798            settings.autoformat = true;
799        } else {
800            let mut specs = vec![];
801            for part in format.split([' ', ',', '\t']) {
802                specs.push(Spec::parse(part)?);
803            }
804            settings.format = specs;
805        }
806    }
807
808    if let Some(empty) = matches.get_one::<String>("e") {
809        settings.empty = empty.as_bytes().to_vec();
810    }
811
812    if matches.get_flag("nocheck-order") {
813        settings.check_order = CheckOrder::Disabled;
814    }
815
816    if matches.get_flag("check-order") {
817        settings.check_order = CheckOrder::Enabled;
818    }
819
820    if matches.get_flag("header") {
821        settings.headers = true;
822    }
823
824    settings.line_ending = LineEnding::from_zero_flag(matches.get_flag("z"));
825
826    Ok(settings)
827}
828
829#[uucore::main]
830pub fn uumain(args: impl uucore::Args) -> UResult<()> {
831    let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
832
833    let mut opts = CollatorOptions::default();
834    opts.alternate_handling = Some(AlternateHandling::Shifted);
835    let _ = try_init_collator(opts);
836
837    let settings = parse_settings(&matches)?;
838
839    let file1 = matches.get_one::<OsString>("file1").unwrap();
840    let file2 = matches.get_one::<OsString>("file2").unwrap();
841
842    if file1 == "-" && file2 == "-" {
843        return Err(USimpleError::new(
844            1,
845            translate!("join-error-both-files-stdin"),
846        ));
847    }
848
849    let sep = settings.separator.clone();
850    match sep {
851        SepSetting::Byte(byte) => exec(file1, file2, settings, OneByteSep { byte: [byte] }),
852        SepSetting::Char(c) => exec(
853            file1,
854            file2,
855            settings,
856            MultiByteSep {
857                finder: Finder::new(&c),
858            },
859        ),
860        SepSetting::Whitespaces => exec(file1, file2, settings, WhitespaceSep {}),
861        SepSetting::Line => exec(file1, file2, settings, LineSep {}),
862    }
863}
864
865pub fn uu_app() -> Command {
866    Command::new(uucore::util_name())
867        .version(uucore::crate_version!())
868        .help_template(uucore::localized_help_template(uucore::util_name()))
869        .about(translate!("join-about"))
870        .override_usage(format_usage(&translate!("join-usage")))
871        .infer_long_args(true)
872        .arg(
873            Arg::new("a")
874                .short('a')
875                .action(ArgAction::Append)
876                .num_args(1)
877                .value_parser(["1", "2"])
878                .value_name("FILENUM")
879                .help(translate!("join-help-a")),
880        )
881        .arg(
882            Arg::new("v")
883                .short('v')
884                .action(ArgAction::Append)
885                .num_args(1)
886                .value_parser(["1", "2"])
887                .value_name("FILENUM")
888                .help(translate!("join-help-v")),
889        )
890        .arg(
891            Arg::new("e")
892                .short('e')
893                .value_name("EMPTY")
894                .help(translate!("join-help-e")),
895        )
896        .arg(
897            Arg::new("i")
898                .short('i')
899                .long("ignore-case")
900                .help(translate!("join-help-i"))
901                .action(ArgAction::SetTrue),
902        )
903        .arg(
904            Arg::new("j")
905                .short('j')
906                .value_name("FIELD")
907                .help(translate!("join-help-j")),
908        )
909        .arg(
910            Arg::new("o")
911                .short('o')
912                .value_name("FORMAT")
913                .help(translate!("join-help-o")),
914        )
915        .arg(
916            Arg::new("t")
917                .short('t')
918                .value_name("CHAR")
919                .value_parser(ValueParser::os_string())
920                .help(translate!("join-help-t")),
921        )
922        .arg(
923            Arg::new("1")
924                .short('1')
925                .value_name("FIELD")
926                .help(translate!("join-help-1")),
927        )
928        .arg(
929            Arg::new("2")
930                .short('2')
931                .value_name("FIELD")
932                .help(translate!("join-help-2")),
933        )
934        .arg(
935            Arg::new("check-order")
936                .long("check-order")
937                .help(translate!("join-help-check-order"))
938                .action(ArgAction::SetTrue),
939        )
940        .arg(
941            Arg::new("nocheck-order")
942                .long("nocheck-order")
943                .help(translate!("join-help-nocheck-order"))
944                .action(ArgAction::SetTrue),
945        )
946        .arg(
947            Arg::new("header")
948                .long("header")
949                .help(translate!("join-help-header"))
950                .action(ArgAction::SetTrue),
951        )
952        .arg(
953            Arg::new("z")
954                .short('z')
955                .long("zero-terminated")
956                .help(translate!("join-help-z"))
957                .action(ArgAction::SetTrue),
958        )
959        .arg(
960            Arg::new("file1")
961                .required(true)
962                .value_name("FILE1")
963                .value_hint(clap::ValueHint::FilePath)
964                .value_parser(clap::value_parser!(OsString))
965                .hide(true),
966        )
967        .arg(
968            Arg::new("file2")
969                .required(true)
970                .value_name("FILE2")
971                .value_hint(clap::ValueHint::FilePath)
972                .value_parser(clap::value_parser!(OsString))
973                .hide(true),
974        )
975}
976
977fn exec<Sep: Separator>(
978    file1: &OsString,
979    file2: &OsString,
980    settings: Settings,
981    sep: Sep,
982) -> UResult<()> {
983    let stdin = stdin();
984
985    let mut state1 = State::new(
986        FileNum::File1,
987        file1,
988        &stdin,
989        settings.key1,
990        settings.line_ending,
991        settings.print_unpaired1,
992    )?;
993
994    let mut state2 = State::new(
995        FileNum::File2,
996        file2,
997        &stdin,
998        settings.key2,
999        settings.line_ending,
1000        settings.print_unpaired2,
1001    )?;
1002
1003    let input = Input::new(
1004        sep.clone(),
1005        settings.ignore_case,
1006        settings.check_order,
1007        should_use_locale_collation(),
1008    );
1009
1010    let format = if settings.autoformat {
1011        let mut format = vec![Spec::Key];
1012        let mut initialize = |state: &mut State| -> UResult<()> {
1013            let max_fields = state.initialize(&sep, settings.autoformat)?;
1014            for i in 0..max_fields {
1015                if i != state.key {
1016                    format.push(Spec::Field(state.file_num, i));
1017                }
1018            }
1019            Ok(())
1020        };
1021        initialize(&mut state1)?;
1022        initialize(&mut state2)?;
1023        format
1024    } else {
1025        state1.initialize(&sep, settings.autoformat)?;
1026        state2.initialize(&sep, settings.autoformat)?;
1027        settings.format
1028    };
1029
1030    let repr = Repr::new(settings.line_ending, sep, format, &settings.empty);
1031
1032    let stdout = stdout();
1033    let mut writer = BufWriter::new(stdout.lock());
1034
1035    if settings.headers {
1036        state1.write_headers(&mut writer, &state2, &repr)?;
1037        state1.reset_read_line(&input)?;
1038        state2.reset_read_line(&input)?;
1039    }
1040
1041    while state1.has_line() && state2.has_line() {
1042        let diff = input.compare(state1.get_current_key(), state2.get_current_key());
1043
1044        match diff {
1045            Ordering::Less => {
1046                if let Err(e) = state1.skip_line(&mut writer, &input, &repr) {
1047                    writer.flush()?;
1048                    return Err(e);
1049                }
1050                state1.has_unpaired = true;
1051                state2.has_unpaired = true;
1052            }
1053            Ordering::Greater => {
1054                if let Err(e) = state2.skip_line(&mut writer, &input, &repr) {
1055                    writer.flush()?;
1056                    return Err(e);
1057                }
1058                state1.has_unpaired = true;
1059                state2.has_unpaired = true;
1060            }
1061            Ordering::Equal => {
1062                let next_line1 = match state1.extend(&input) {
1063                    Ok(line) => line,
1064                    Err(e) => {
1065                        writer.flush()?;
1066                        return Err(e);
1067                    }
1068                };
1069                let next_line2 = match state2.extend(&input) {
1070                    Ok(line) => line,
1071                    Err(e) => {
1072                        writer.flush()?;
1073                        return Err(e);
1074                    }
1075                };
1076
1077                if settings.print_joined {
1078                    state1.combine(&mut writer, &state2, &repr)?;
1079                }
1080
1081                state1.reset(next_line1);
1082                state2.reset(next_line2);
1083            }
1084        }
1085    }
1086
1087    if let Err(e) = state1.finalize(&mut writer, &input, &repr) {
1088        writer.flush()?;
1089        return Err(e);
1090    }
1091    if let Err(e) = state2.finalize(&mut writer, &input, &repr) {
1092        writer.flush()?;
1093        return Err(e);
1094    }
1095
1096    writer.flush()?;
1097
1098    if state1.has_failed || state2.has_failed {
1099        eprintln!(
1100            "{}: {}",
1101            uucore::execution_phrase(),
1102            translate!("join-error-input-not-sorted")
1103        );
1104        set_exit_code(1);
1105    }
1106    Ok(())
1107}
1108
1109/// Check that keys for both files and for a particular file are not
1110/// contradictory and return the key index.
1111fn get_field_number(keys: Option<usize>, key: Option<usize>) -> UResult<usize> {
1112    if let Some(keys) = keys {
1113        if let Some(key) = key {
1114            if keys != key {
1115                // Show zero-based field numbers as one-based.
1116                return Err(USimpleError::new(
1117                    1,
1118                    translate!("join-error-incompatible-fields", "field1" => (keys + 1), "field2" => (key + 1)),
1119                ));
1120            }
1121        }
1122
1123        return Ok(keys);
1124    }
1125
1126    Ok(key.unwrap_or(0))
1127}
1128
1129/// Parse the specified field string as a natural number and return
1130/// the zero-based field number.
1131fn parse_field_number(value: &str) -> UResult<usize> {
1132    match value.parse::<usize>() {
1133        Ok(result) if result > 0 => Ok(result - 1),
1134        Err(e) if e.kind() == &IntErrorKind::PosOverflow => Ok(usize::MAX),
1135        _ => Err(USimpleError::new(
1136            1,
1137            translate!("join-error-invalid-field-number", "value" => value.quote()),
1138        )),
1139    }
1140}
1141
1142fn parse_file_number(value: &str) -> UResult<FileNum> {
1143    match value {
1144        "1" => Ok(FileNum::File1),
1145        "2" => Ok(FileNum::File2),
1146        value => Err(USimpleError::new(
1147            1,
1148            translate!("join-error-invalid-file-number-simple", "value" => value.quote()),
1149        )),
1150    }
1151}
1152
1153fn parse_field_number_option(value: Option<&str>) -> UResult<Option<usize>> {
1154    match value {
1155        None => Ok(None),
1156        Some(val) => Ok(Some(parse_field_number(val)?)),
1157    }
1158}