use std::cmp::Ordering;
use std::io::{self, Write};
/// How strictly `comm` verifies that its inputs are sorted.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum OrderCheck {
    /// Report an out-of-order line but keep processing; a summary diagnostic
    /// is printed once all input has been handled.
    Default,
    /// Stop at the first out-of-order line that is detected.
    Strict,
    /// Perform no order checking at all.
    None,
}
/// Options controlling a [`comm`] run.
pub struct CommConfig {
    /// Do not print column 1 (lines found only in the first input).
    pub suppress_col1: bool,
    /// Do not print column 2 (lines found only in the second input).
    pub suppress_col2: bool,
    /// Do not print column 3 (lines found in both inputs).
    pub suppress_col3: bool,
    /// Compare lines ignoring ASCII case.
    pub case_insensitive: bool,
    /// How strictly the sort order of the inputs is verified.
    pub order_check: OrderCheck,
    /// Bytes written between columns; `None` selects a single tab.
    pub output_delimiter: Option<Vec<u8>>,
    /// Append a final summary line holding the three column counts.
    pub total: bool,
    /// Treat NUL rather than newline as the line terminator, for both input
    /// and output.
    pub zero_terminated: bool,
}
impl Default for CommConfig {
fn default() -> Self {
Self {
suppress_col1: false,
suppress_col2: false,
suppress_col3: false,
case_insensitive: false,
order_check: OrderCheck::Default,
output_delimiter: None,
total: false,
zero_terminated: false,
}
}
}
/// Summary of a `comm` run.
///
/// The counters record how many lines were classified into each column,
/// whether or not that column was actually printed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommResult {
    /// Number of lines found only in the first input.
    pub count1: usize,
    /// Number of lines found only in the second input.
    pub count2: usize,
    /// Number of lines found in both inputs.
    pub count3: usize,
    /// `true` when an out-of-order line was detected in either input.
    pub had_order_error: bool,
}
/// Orders two lines byte-wise.
///
/// With `case_insensitive` set, each byte is folded to ASCII lowercase before
/// comparison; non-ASCII bytes are compared unchanged. When one line is a
/// prefix of the other, the shorter line sorts first.
#[inline(always)]
fn compare_lines(a: &[u8], b: &[u8], case_insensitive: bool) -> Ordering {
    if !case_insensitive {
        return a.cmp(b);
    }
    // Lexicographic comparison over the folded bytes: the first differing
    // pair decides, otherwise the shorter sequence is the lesser one.
    let folded_a = a.iter().map(u8::to_ascii_lowercase);
    let folded_b = b.iter().map(u8::to_ascii_lowercase);
    folded_a.cmp(folded_b)
}
/// Appends `prefix`, then `line`, then the single byte `delim` to `buf`
/// without performing any capacity check.
///
/// # Safety
///
/// The caller must guarantee that `buf` has at least
/// `prefix.len() + line.len() + 1` bytes of spare capacity
/// (`buf.capacity() - buf.len()`). Neither `prefix` nor `line` may alias the
/// spare capacity of `buf`; this always holds for slices obtained through safe
/// code, because `buf` is borrowed mutably.
#[inline(always)]
unsafe fn write_line(buf: &mut Vec<u8>, prefix: &[u8], line: &[u8], delim: u8) {
    let prefix_len = prefix.len();
    let line_len = line.len();
    let old_len = buf.len();
    // SAFETY: the caller guarantees enough spare capacity for all bytes
    // written below, so every pointer stays inside the allocation, and
    // `set_len` only exposes bytes that have just been initialised.
    unsafe {
        let mut cursor = buf.as_mut_ptr().add(old_len);
        if prefix_len != 0 {
            std::ptr::copy_nonoverlapping(prefix.as_ptr(), cursor, prefix_len);
            cursor = cursor.add(prefix_len);
        }
        if line_len != 0 {
            std::ptr::copy_nonoverlapping(line.as_ptr(), cursor, line_len);
            cursor = cursor.add(line_len);
        }
        cursor.write(delim);
        buf.set_len(old_len + prefix_len + line_len + 1);
    }
}
/// Makes sure `buf` can take `needed` more bytes without reallocating.
///
/// When the spare capacity is too small, the buffer is grown by `needed` plus
/// 64 KiB of slack so that the following lines do not each trigger another
/// reallocation.
#[inline(always)]
fn ensure_capacity(buf: &mut Vec<u8>, needed: usize) {
    const GROWTH_SLACK: usize = 64 * 1024;

    let spare = buf.capacity() - buf.len();
    if needed > spare {
        buf.reserve(needed + GROWTH_SLACK);
    }
}
/// Fast path used when both inputs hold exactly the same bytes, so every line
/// belongs to column 3.
///
/// `data` is split on `delim`; one trailing `delim` is treated as the
/// terminator of the last line rather than as the start of another one. Any
/// non-empty input therefore contains at least one line, and an empty final
/// line (input consisting of a single delimiter, or ending in two of them) is
/// emitted and counted like any other line.
///
/// Unless column 3 is suppressed, each line is written to `out` preceded by
/// one copy of `sep` for every earlier column that is still shown, and
/// followed by `delim`.
///
/// # Returns
///
/// A [`CommResult`] whose `count3` is the number of lines in `data`; the other
/// counters are zero and `had_order_error` is `false`.
///
/// # Errors
///
/// Propagates any error returned by `out.write_all`.
fn comm_identical(
    data: &[u8],
    config: &CommConfig,
    delim: u8,
    sep: &[u8],
    out: &mut impl Write,
) -> io::Result<CommResult> {
    const BUF_CAPACITY: usize = 256 * 1024;
    const FLUSH_THRESHOLD: usize = 192 * 1024;
    const GROWTH_SLACK: usize = 64 * 1024;

    if data.is_empty() {
        return Ok(CommResult {
            count1: 0,
            count2: 0,
            count3: 0,
            had_order_error: false,
        });
    }

    // Drop the terminator of the last line so that the remaining delimiters
    // are pure separators between lines.
    let stripped = if data.last() == Some(&delim) {
        &data[..data.len() - 1]
    } else {
        data
    };
    // N separators delimit N + 1 lines; `data` is known to be non-empty here.
    let line_count = memchr::memchr_iter(delim, stripped).count() + 1;

    if !config.suppress_col3 {
        let mut prefix: Vec<u8> = Vec::new();
        if !config.suppress_col1 {
            prefix.extend_from_slice(sep);
        }
        if !config.suppress_col2 {
            prefix.extend_from_slice(sep);
        }

        let mut buf: Vec<u8> = Vec::with_capacity(BUF_CAPACITY);
        let mut pos = 0;
        // Every separator ends a line, and so does the end of the stripped
        // input. Chaining the latter handles the final line (which may be
        // empty) exactly like all the others.
        let line_ends =
            memchr::memchr_iter(delim, stripped).chain(std::iter::once(stripped.len()));
        for end in line_ends {
            let line = &stripped[pos..end];
            let needed = prefix.len() + line.len() + 1;
            if buf.len() + needed > FLUSH_THRESHOLD {
                out.write_all(&buf)?;
                buf.clear();
            }
            if buf.capacity() - buf.len() < needed {
                buf.reserve(needed + GROWTH_SLACK);
            }
            // SAFETY: the check above guarantees at least `needed` bytes of
            // spare capacity, which is exactly what `write_line` appends.
            unsafe {
                write_line(&mut buf, &prefix, line, delim);
            }
            pos = end + 1;
        }
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }

    Ok(CommResult {
        count1: 0,
        count2: 0,
        count3: line_count,
        had_order_error: false,
    })
}
/// Compares two sorted inputs line by line and writes up to three columns to
/// `out`: lines only in `data1`, lines only in `data2`, and lines in both.
///
/// Lines are separated by newline, or by NUL when `config.zero_terminated` is
/// set. One trailing delimiter terminates the last line instead of starting a
/// new one, so a non-empty input always holds at least one line and an empty
/// final line (input that is a single delimiter, or that ends in two
/// delimiters) is processed like any other line.
///
/// Column 2 is prefixed with the output separator unless column 1 is
/// suppressed; column 3 is prefixed with one separator for each of columns 1
/// and 2 that is not suppressed. The separator is
/// `config.output_delimiter`, or a tab when that is `None`.
///
/// # Order checking
///
/// Unless `config.order_check` is [`OrderCheck::None`], a line about to be
/// classified as unique to its file is compared with the previously consumed
/// line of the same file. The first violation per file prints
/// `"<tool_name>: file N is not in sorted order"` to stderr. Lines that match
/// in both files are not checked. With [`OrderCheck::Strict`] the function
/// flushes what it has buffered and returns immediately; with
/// [`OrderCheck::Default`] it continues and prints
/// `"<tool_name>: input is not in sorted order"` before returning.
///
/// # Returns
///
/// The per-column line counts (counted even for suppressed columns) and
/// whether an ordering violation was seen. When `config.total` is set the
/// counts are also written as a final `count1 sep count2 sep count3 sep total`
/// line.
///
/// # Errors
///
/// Propagates any error returned by `out.write_all`.
pub fn comm(
    data1: &[u8],
    data2: &[u8],
    config: &CommConfig,
    tool_name: &str,
    out: &mut impl Write,
) -> io::Result<CommResult> {
    let delim = if config.zero_terminated { b'\0' } else { b'\n' };
    let sep = config.output_delimiter.as_deref().unwrap_or(b"\t");

    // Byte-identical inputs need no comparison at all, provided nothing that
    // only the general path implements (case folding, totals, order checks)
    // was requested.
    if data1 == data2
        && !config.case_insensitive
        && !config.total
        && config.order_check == OrderCheck::None
    {
        return comm_identical(data1, config, delim, sep, out);
    }

    // Column prefixes: one separator per preceding column that is shown.
    let prefix1: &[u8] = &[];
    let prefix2: &[u8] = if config.suppress_col1 { &[] } else { sep };
    let mut prefix3: Vec<u8> = Vec::new();
    if !config.suppress_col1 {
        prefix3.extend_from_slice(sep);
    }
    if !config.suppress_col2 {
        prefix3.extend_from_slice(sep);
    }

    let show1 = !config.suppress_col1;
    let show2 = !config.suppress_col2;
    let show3 = !config.suppress_col3;
    let ci = config.case_insensitive;
    let check_order = config.order_check != OrderCheck::None;
    let strict = config.order_check == OrderCheck::Strict;

    let mut buf: Vec<u8> = Vec::with_capacity(256 * 1024);
    let flush_threshold = 192 * 1024;

    let mut count1 = 0usize;
    let mut count2 = 0usize;
    let mut count3 = 0usize;
    let mut had_order_error = false;
    let mut warned1 = false;
    let mut warned2 = false;

    // Length of each input without the terminator of its last line; the
    // delimiters left inside `[..len]` are pure separators between lines.
    let len1 = if data1.last() == Some(&delim) {
        data1.len() - 1
    } else {
        data1.len()
    };
    let len2 = if data2.last() == Some(&delim) {
        data2.len() - 1
    } else {
        data2.len()
    };

    let mut iter1 = memchr::memchr_iter(delim, &data1[..len1]);
    let mut iter2 = memchr::memchr_iter(delim, &data2[..len2]);
    // Current line of file N is `dataN[posN..endN]` while `moreN` is true.
    let mut pos1 = 0usize;
    let mut pos2 = 0usize;
    let mut end1 = iter1.next().unwrap_or(len1);
    let mut end2 = iter2.next().unwrap_or(len2);
    // A non-empty input always has a first line, even if that line is empty.
    // After consuming a line, another one follows exactly when the consumed
    // line ended at a separator (`end < len`) rather than at the end of input.
    let mut more1 = !data1.is_empty();
    let mut more2 = !data2.is_empty();
    let mut prev1: Option<&[u8]> = None;
    let mut prev2: Option<&[u8]> = None;

    // Merge phase: both inputs still have lines.
    while more1 && more2 {
        let line1 = &data1[pos1..end1];
        let line2 = &data2[pos2..end2];
        match compare_lines(line1, line2, ci) {
            Ordering::Less => {
                if check_order
                    && !warned1
                    && prev1.map_or(false, |prev| compare_lines(line1, prev, ci) == Ordering::Less)
                {
                    had_order_error = true;
                    warned1 = true;
                    eprintln!("{}: file {} is not in sorted order", tool_name, 1);
                    if strict {
                        out.write_all(&buf)?;
                        return Ok(CommResult {
                            count1,
                            count2,
                            count3,
                            had_order_error,
                        });
                    }
                }
                if show1 {
                    ensure_capacity(&mut buf, prefix1.len() + line1.len() + 1);
                    // SAFETY: `ensure_capacity` just reserved room for the
                    // prefix, the line and the delimiter.
                    unsafe {
                        write_line(&mut buf, prefix1, line1, delim);
                    }
                }
                count1 += 1;
                prev1 = Some(line1);
                more1 = end1 < len1;
                pos1 = end1 + 1;
                end1 = iter1.next().unwrap_or(len1);
            }
            Ordering::Greater => {
                if check_order
                    && !warned2
                    && prev2.map_or(false, |prev| compare_lines(line2, prev, ci) == Ordering::Less)
                {
                    had_order_error = true;
                    warned2 = true;
                    eprintln!("{}: file {} is not in sorted order", tool_name, 2);
                    if strict {
                        out.write_all(&buf)?;
                        return Ok(CommResult {
                            count1,
                            count2,
                            count3,
                            had_order_error,
                        });
                    }
                }
                if show2 {
                    ensure_capacity(&mut buf, prefix2.len() + line2.len() + 1);
                    // SAFETY: `ensure_capacity` just reserved room for the
                    // prefix, the line and the delimiter.
                    unsafe {
                        write_line(&mut buf, prefix2, line2, delim);
                    }
                }
                count2 += 1;
                prev2 = Some(line2);
                more2 = end2 < len2;
                pos2 = end2 + 1;
                end2 = iter2.next().unwrap_or(len2);
            }
            Ordering::Equal => {
                if show3 {
                    ensure_capacity(&mut buf, prefix3.len() + line1.len() + 1);
                    // SAFETY: `ensure_capacity` just reserved room for the
                    // prefix, the line and the delimiter.
                    unsafe {
                        write_line(&mut buf, &prefix3, line1, delim);
                    }
                }
                count3 += 1;
                prev1 = Some(line1);
                prev2 = Some(line2);
                more1 = end1 < len1;
                pos1 = end1 + 1;
                end1 = iter1.next().unwrap_or(len1);
                more2 = end2 < len2;
                pos2 = end2 + 1;
                end2 = iter2.next().unwrap_or(len2);
            }
        }
        if buf.len() >= flush_threshold {
            out.write_all(&buf)?;
            buf.clear();
        }
    }

    // Bulk copy of the rest of file 1. Column 1 has no prefix, so when no
    // further order checks are required the remaining bytes are already in
    // output format apart from the terminator of the last line.
    if more1 && show1 && (!check_order || warned1) {
        let remaining = &data1[pos1..len1];
        // N separators delimit N + 1 lines (the last one may be empty).
        count1 += memchr::memchr_iter(delim, remaining).count() + 1;
        if !buf.is_empty() {
            out.write_all(&buf)?;
            buf.clear();
        }
        out.write_all(remaining)?;
        out.write_all(&[delim])?;
        more1 = false;
    }

    // Line-by-line handling of the rest of file 1 (order checks pending, or
    // column 1 suppressed so only counting is needed).
    while more1 {
        let line1 = &data1[pos1..end1];
        if check_order
            && !warned1
            && prev1.map_or(false, |prev| compare_lines(line1, prev, ci) == Ordering::Less)
        {
            had_order_error = true;
            warned1 = true;
            eprintln!("{}: file 1 is not in sorted order", tool_name);
            if strict {
                out.write_all(&buf)?;
                return Ok(CommResult {
                    count1,
                    count2,
                    count3,
                    had_order_error,
                });
            }
        }
        if show1 {
            ensure_capacity(&mut buf, prefix1.len() + line1.len() + 1);
            // SAFETY: `ensure_capacity` just reserved room for the prefix,
            // the line and the delimiter.
            unsafe {
                write_line(&mut buf, prefix1, line1, delim);
            }
        }
        count1 += 1;
        prev1 = Some(line1);
        more1 = end1 < len1;
        pos1 = end1 + 1;
        end1 = iter1.next().unwrap_or(len1);
        if buf.len() >= flush_threshold {
            out.write_all(&buf)?;
            buf.clear();
        }
    }

    // Bulk copy of the rest of file 2; only possible when column 2 carries no
    // prefix and no further order checks are required.
    if more2 && show2 && (!check_order || warned2) && prefix2.is_empty() {
        let remaining = &data2[pos2..len2];
        count2 += memchr::memchr_iter(delim, remaining).count() + 1;
        if !buf.is_empty() {
            out.write_all(&buf)?;
            buf.clear();
        }
        out.write_all(remaining)?;
        out.write_all(&[delim])?;
        more2 = false;
    }

    // Line-by-line handling of the rest of file 2.
    while more2 {
        let line2 = &data2[pos2..end2];
        if check_order
            && !warned2
            && prev2.map_or(false, |prev| compare_lines(line2, prev, ci) == Ordering::Less)
        {
            had_order_error = true;
            warned2 = true;
            eprintln!("{}: file 2 is not in sorted order", tool_name);
            if strict {
                out.write_all(&buf)?;
                return Ok(CommResult {
                    count1,
                    count2,
                    count3,
                    had_order_error,
                });
            }
        }
        if show2 {
            ensure_capacity(&mut buf, prefix2.len() + line2.len() + 1);
            // SAFETY: `ensure_capacity` just reserved room for the prefix,
            // the line and the delimiter.
            unsafe {
                write_line(&mut buf, prefix2, line2, delim);
            }
        }
        count2 += 1;
        prev2 = Some(line2);
        more2 = end2 < len2;
        pos2 = end2 + 1;
        end2 = iter2.next().unwrap_or(len2);
        if buf.len() >= flush_threshold {
            out.write_all(&buf)?;
            buf.clear();
        }
    }

    if config.total {
        let mut itoa_buf = itoa::Buffer::new();
        buf.extend_from_slice(itoa_buf.format(count1).as_bytes());
        buf.extend_from_slice(sep);
        buf.extend_from_slice(itoa_buf.format(count2).as_bytes());
        buf.extend_from_slice(sep);
        buf.extend_from_slice(itoa_buf.format(count3).as_bytes());
        buf.extend_from_slice(sep);
        buf.extend_from_slice(b"total");
        buf.push(delim);
    }

    if had_order_error && config.order_check == OrderCheck::Default {
        eprintln!("{}: input is not in sorted order", tool_name);
    }

    out.write_all(&buf)?;
    Ok(CommResult {
        count1,
        count2,
        count3,
        had_order_error,
    })
}