Skip to main content

coreutils_rs/comm/
core.rs

1use std::cmp::Ordering;
2use std::io::{self, Write};
3
4/// How to handle sort-order checking.
5#[derive(Debug, Clone, Copy, PartialEq, Eq)]
6pub enum OrderCheck {
7    /// Default: check, warn once per file, continue, exit 1
8    Default,
9    /// --check-order: check, error, stop immediately
10    Strict,
11    /// --nocheck-order: no checking
12    None,
13}
14
15/// Configuration for the comm command.
16pub struct CommConfig {
17    pub suppress_col1: bool,
18    pub suppress_col2: bool,
19    pub suppress_col3: bool,
20    pub case_insensitive: bool,
21    pub order_check: OrderCheck,
22    pub output_delimiter: Option<Vec<u8>>,
23    pub total: bool,
24    pub zero_terminated: bool,
25}
26
27impl Default for CommConfig {
28    fn default() -> Self {
29        Self {
30            suppress_col1: false,
31            suppress_col2: false,
32            suppress_col3: false,
33            case_insensitive: false,
34            order_check: OrderCheck::Default,
35            output_delimiter: None,
36            total: false,
37            zero_terminated: false,
38        }
39    }
40}
41
42/// Result of the comm operation.
43pub struct CommResult {
44    pub count1: usize,
45    pub count2: usize,
46    pub count3: usize,
47    pub had_order_error: bool,
48}
49
50/// Compare two byte slices, optionally case-insensitive (ASCII).
51#[inline]
52fn compare_lines(a: &[u8], b: &[u8], case_insensitive: bool) -> Ordering {
53    if case_insensitive {
54        for (&ca, &cb) in a.iter().zip(b.iter()) {
55            match ca.to_ascii_lowercase().cmp(&cb.to_ascii_lowercase()) {
56                Ordering::Equal => continue,
57                other => return other,
58            }
59        }
60        a.len().cmp(&b.len())
61    } else {
62        a.cmp(b)
63    }
64}
65
66/// Split data into lines by delimiter, using SIMD-accelerated scanning.
67/// Does NOT include a trailing empty line if data ends with the delimiter.
68fn split_lines<'a>(data: &'a [u8], delim: u8) -> Vec<&'a [u8]> {
69    if data.is_empty() {
70        return Vec::new();
71    }
72    let count = memchr::memchr_iter(delim, data).count();
73    let has_trailing = data.last() == Some(&delim);
74    let cap = if has_trailing { count } else { count + 1 };
75    let mut lines = Vec::with_capacity(cap);
76    let mut start = 0;
77    for pos in memchr::memchr_iter(delim, data) {
78        lines.push(&data[start..pos]);
79        start = pos + 1;
80    }
81    if start < data.len() {
82        lines.push(&data[start..]);
83    }
84    lines
85}
86
87/// Run the comm merge algorithm on two sorted inputs.
88pub fn comm(
89    data1: &[u8],
90    data2: &[u8],
91    config: &CommConfig,
92    tool_name: &str,
93    out: &mut impl Write,
94) -> io::Result<CommResult> {
95    let delim = if config.zero_terminated { b'\0' } else { b'\n' };
96    let sep = config.output_delimiter.as_deref().unwrap_or(b"\t");
97
98    // Build column prefixes. Each shown column before the current one
99    // contributes one copy of the separator.
100    // Column 1: always empty prefix.
101    let prefix2: Vec<u8> = if !config.suppress_col1 {
102        sep.to_vec()
103    } else {
104        Vec::new()
105    };
106    let mut prefix3: Vec<u8> = Vec::new();
107    if !config.suppress_col1 {
108        prefix3.extend_from_slice(sep);
109    }
110    if !config.suppress_col2 {
111        prefix3.extend_from_slice(sep);
112    }
113
114    let lines1 = split_lines(data1, delim);
115    let lines2 = split_lines(data2, delim);
116
117    let mut i1 = 0usize;
118    let mut i2 = 0usize;
119    let mut count1 = 0usize;
120    let mut count2 = 0usize;
121    let mut count3 = 0usize;
122    let mut had_order_error = false;
123    let mut warned1 = false;
124    let mut warned2 = false;
125    let ci = config.case_insensitive;
126
127    let mut buf = Vec::with_capacity(data1.len() + data2.len());
128
129    // Macro to check sort order of a file and handle warnings/errors.
130    macro_rules! check_order {
131        ($warned:ident, $lines:ident, $idx:ident, $file_num:expr) => {
132            if config.order_check != OrderCheck::None
133                && !$warned
134                && $idx > 0
135                && compare_lines($lines[$idx], $lines[$idx - 1], ci) == Ordering::Less
136            {
137                had_order_error = true;
138                $warned = true;
139                eprintln!("{}: file {} is not in sorted order", tool_name, $file_num);
140                if config.order_check == OrderCheck::Strict {
141                    out.write_all(&buf)?;
142                    return Ok(CommResult {
143                        count1,
144                        count2,
145                        count3,
146                        had_order_error,
147                    });
148                }
149            }
150        };
151    }
152
153    while i1 < lines1.len() && i2 < lines2.len() {
154        match compare_lines(lines1[i1], lines2[i2], ci) {
155            Ordering::Less => {
156                // File1 line is unique — check file1 sort order before consuming
157                check_order!(warned1, lines1, i1, 1);
158                if !config.suppress_col1 {
159                    buf.extend_from_slice(lines1[i1]);
160                    buf.push(delim);
161                }
162                count1 += 1;
163                i1 += 1;
164            }
165            Ordering::Greater => {
166                // File2 line is unique — check file2 sort order before consuming
167                check_order!(warned2, lines2, i2, 2);
168                if !config.suppress_col2 {
169                    buf.extend_from_slice(&prefix2);
170                    buf.extend_from_slice(lines2[i2]);
171                    buf.push(delim);
172                }
173                count2 += 1;
174                i2 += 1;
175            }
176            Ordering::Equal => {
177                // Lines match — no sort check needed (GNU comm behavior)
178                if !config.suppress_col3 {
179                    buf.extend_from_slice(&prefix3);
180                    buf.extend_from_slice(lines1[i1]);
181                    buf.push(delim);
182                }
183                count3 += 1;
184                i1 += 1;
185                i2 += 1;
186            }
187        }
188    }
189
190    // Drain remaining from file 1
191    while i1 < lines1.len() {
192        if config.order_check != OrderCheck::None
193            && !warned1
194            && i1 > 0
195            && compare_lines(lines1[i1], lines1[i1 - 1], ci) == Ordering::Less
196        {
197            had_order_error = true;
198            warned1 = true;
199            eprintln!("{}: file 1 is not in sorted order", tool_name);
200            if config.order_check == OrderCheck::Strict {
201                out.write_all(&buf)?;
202                return Ok(CommResult {
203                    count1,
204                    count2,
205                    count3,
206                    had_order_error,
207                });
208            }
209        }
210        if !config.suppress_col1 {
211            buf.extend_from_slice(lines1[i1]);
212            buf.push(delim);
213        }
214        count1 += 1;
215        i1 += 1;
216    }
217
218    // Drain remaining from file 2
219    while i2 < lines2.len() {
220        if config.order_check != OrderCheck::None
221            && !warned2
222            && i2 > 0
223            && compare_lines(lines2[i2], lines2[i2 - 1], ci) == Ordering::Less
224        {
225            had_order_error = true;
226            warned2 = true;
227            eprintln!("{}: file 2 is not in sorted order", tool_name);
228            if config.order_check == OrderCheck::Strict {
229                out.write_all(&buf)?;
230                return Ok(CommResult {
231                    count1,
232                    count2,
233                    count3,
234                    had_order_error,
235                });
236            }
237        }
238        if !config.suppress_col2 {
239            buf.extend_from_slice(&prefix2);
240            buf.extend_from_slice(lines2[i2]);
241            buf.push(delim);
242        }
243        count2 += 1;
244        i2 += 1;
245    }
246
247    // Total summary line
248    if config.total {
249        write!(&mut buf, "{}", count1)?;
250        buf.extend_from_slice(sep);
251        write!(&mut buf, "{}", count2)?;
252        buf.extend_from_slice(sep);
253        write!(&mut buf, "{}", count3)?;
254        buf.extend_from_slice(sep);
255        buf.extend_from_slice(b"total");
256        buf.push(delim);
257    }
258
259    // In Default mode, print a final summary message (matches GNU comm behavior)
260    if had_order_error && config.order_check == OrderCheck::Default {
261        eprintln!("{}: input is not in sorted order", tool_name);
262    }
263
264    out.write_all(&buf)?;
265    Ok(CommResult {
266        count1,
267        count2,
268        count3,
269        had_order_error,
270    })
271}