use std::collections::VecDeque;
use std::fs::File;
use std::io::{self, Read};
use std::str::Utf8Error;
/// Wraps an expression in a [`std::borrow::Cow`] through its `From` conversion.
///
/// A `&str` argument yields a borrowed `Cow`, a `String` argument an owned one.
macro_rules! cow {
    ($value:expr) => {
        std::borrow::Cow::from($value)
    };
}
/// Tally of the bytes that start a UTF-8 encoded character, grouped by the
/// sequence length their bit pattern announces.
#[derive(Default, Clone, Copy, Debug)]
struct Counters {
    /// Lead bytes of 4-byte sequences (`11110xxx`).
    len4: usize,
    /// Lead bytes of 3-byte sequences (`1110xxxx`).
    len3: usize,
    /// Lead bytes of 2-byte sequences (`110xxxxx`).
    len2: usize,
    /// Single-byte characters (`0xxxxxxx`) other than NUL.
    ascii: usize,
    /// NUL bytes (`0x00`).
    nul: usize,
}

impl Counters {
    /// Classifies every byte of `chunk` and returns the resulting tally.
    ///
    /// Continuation bytes (`10xxxxxx`) are skipped. Bytes `0xF8..=0xFF`, which
    /// match no UTF-8 lead pattern, are not counted in any field; they only
    /// force the invalid-input diagnostic below.
    ///
    /// When the tally does not account for every byte of `chunk` (or an invalid
    /// byte was seen), the chunk is re-validated: for invalid UTF-8 a warning is
    /// written to stderr and the (possibly inaccurate) tally is still returned.
    ///
    /// # Panics
    ///
    /// Panics if `chunk` is valid UTF-8 but the tally disagrees with its length,
    /// which would indicate a bug in the classification itself.
    fn count(chunk: &[u8]) -> Counters {
        let mut counts = Counters::default();
        // Bytes that can never appear in UTF-8; tracked so that invalid input is
        // always reported even when the byte totals happen to line up.
        let mut invalid: usize = 0;
        for &b in chunk {
            match b {
                0x00 => counts.nul += 1,
                0x01..=0x7F => counts.ascii += 1,
                // Continuation byte: belongs to a character counted at its lead byte.
                0x80..=0xBF => {}
                0xC0..=0xDF => counts.len2 += 1,
                0xE0..=0xEF => counts.len3 += 1,
                0xF0..=0xF7 => counts.len4 += 1,
                // Previously `unreachable!`, but arbitrary input reaches this arm.
                0xF8..=0xFF => invalid += 1,
            }
        }
        if invalid != 0 || chunk.len() != counts.bytes() {
            match std::str::from_utf8(chunk) {
                Ok(_) => panic!("failed to accurately count byte types for valid UTF-8 file (total = {}, calculated = {})", chunk.len(), counts.bytes()),
                Err(_) => {
                    eprintln!("invalid UTF-8 passed, converting string to lossy UTF-8. Count may be off (current chunk byte count off by {})", chunk.len() as isize - counts.bytes() as isize);
                }
            }
        }
        counts
    }

    /// Number of characters counted, i.e. the sum of all fields.
    fn chars(&self) -> usize {
        self.len4 + self.len3 + self.len2 + self.ascii + self.nul
    }

    /// Number of bytes the counted characters occupy according to their lead bytes.
    fn bytes(&self) -> usize {
        self.len4 * 4 + self.len3 * 3 + self.len2 * 2 + self.ascii + self.nul
    }

    /// Renders the tally as a single-line JSON object.
    fn as_json(&self) -> String {
        format!(
            "{{ \"nul\": {}, \"ascii\": {}, \"len2\": {}, \"len3\": {}, \"len4\": {} }}",
            self.nul, self.ascii, self.len2, self.len3, self.len4
        )
    }
}
impl std::ops::Add<Counters> for Counters {
    type Output = Counters;

    /// Returns the field-by-field sum of the two tallies.
    #[inline]
    fn add(self, rhs: Counters) -> Counters {
        let Counters {
            len4,
            len3,
            len2,
            ascii,
            nul,
        } = rhs;
        Counters {
            len4: self.len4 + len4,
            len3: self.len3 + len3,
            len2: self.len2 + len2,
            ascii: self.ascii + ascii,
            nul: self.nul + nul,
        }
    }
}
impl std::ops::AddAssign<Counters> for Counters {
    /// Adds `rhs` to this tally in place, delegating to the `Add` implementation.
    #[inline]
    fn add_assign(&mut self, rhs: Counters) {
        let current = *self;
        *self = current + rhs;
    }
}
impl<'b> std::ops::AddAssign<&'b [u8]> for Counters {
    /// Tallies the bytes of `rhs` with `Counters::count` and adds the result to `self`.
    fn add_assign(&mut self, rhs: &'b [u8]) {
        let current = *self;
        let tallied = Counters::count(rhs);
        *self = current + tallied;
    }
}
/// Process entry point: runs `real_main` and exits with the status it returns.
fn main() {
    std::process::exit(real_main());
}
/// Parses the command line, counts the input and prints the result.
///
/// Arguments (after the executable name): an optional `--json` flag anywhere,
/// then an optional file name (`-` or absent means stdin), then an optional
/// chunk size in KiB (default 4; zero falls back to the default with a warning).
///
/// Returns the process exit code: `0` on success, `1` for `--help` and for
/// command-line errors, `2` when reading fails or the input ends in bytes that
/// are not valid UTF-8.
fn real_main() -> i32 {
    let mut opts: VecDeque<_> = std::env::args().collect();
    let exe = opts
        .pop_front()
        .map(|s| cow!(s))
        .unwrap_or_else(|| cow!("count_bytes"));
    if opts.iter().any(|s| s == "--help") {
        eprintln!("USAGE: {exe:?} [--json] [file] [KB_chunk_size=4]");
        eprintln!();
        eprintln!("Reads a file and emits the number of occurences differing lengths of UTF-8 codepoints on stdout.");
        eprintln!();
        eprintln!("If `file` is not supplied, stdin will be read.");
        eprintln!();
        eprintln!("Exit Codes:");
        eprintln!("0\tSuccess");
        eprintln!("1\tCommand-line error");
        eprintln!("2\tIO Error");
        return 1;
    }
    // `--json` may appear anywhere; remove it so the remaining arguments are positional.
    let output_json = match opts.iter().position(|s| s == "--json") {
        Some(i) => {
            let _ = opts.remove(i);
            true
        }
        None => false,
    };
    let fname = opts.pop_front();
    let chunk_size: Option<usize> = match opts
        .pop_front()
        .map(|s| s.parse::<usize>().map_err(|e| (s, e)))
        .transpose()
    {
        Ok(Some(0)) => {
            eprintln!("supplied zero-length chunk size. using default chunk size.");
            None
        }
        Ok(o) => o,
        Err((s, e)) => {
            eprintln!("USAGE: {exe} [--json] [file] [KB_chunk_size=4]");
            eprintln!("error parsing argument `chunk_size` (provided: {s:?}) as positive integer: {e}",);
            return 1;
        }
    };
    // The size is user-controlled: a plain `* 1024` would panic (debug) or wrap
    // (release) for very large values, so reject those as a command-line error.
    let chunk = match chunk_size.unwrap_or(4).checked_mul(1024) {
        Some(bytes) => bytes,
        None => {
            eprintln!("USAGE: {exe} [--json] [file] [KB_chunk_size=4]");
            eprintln!("argument `chunk_size` is too large: the buffer size in bytes overflows");
            return 1;
        }
    };
    let count: io::Result<Result<Counters, Utf8Error>> = match fname.as_deref() {
        None | Some("-") => {
            let stdin = io::stdin();
            let mut stdin = stdin.lock();
            readloop(&mut stdin, chunk)
        }
        Some(fname) => File::open(fname).and_then(|mut f| readloop(&mut f, chunk)),
    };
    match count {
        Ok(Ok(n)) if output_json => {
            println!("{}", n.as_json());
            0
        }
        Ok(Ok(n)) => {
            println!("len4: {}", n.len4);
            println!("len3: {}", n.len3);
            println!("len2: {}", n.len2);
            println!("ascii: {}", n.ascii);
            println!("nul: {}", n.nul);
            println!("total (chars/bytes): {}/{}", n.chars(), n.bytes());
            0
        }
        // Input ended with bytes that do not form valid UTF-8.
        Ok(Err(e)) => {
            eprintln!("error reading input: {e}");
            2
        }
        // Opening or reading the input failed.
        Err(e) => {
            eprintln!("error reading input: {e}");
            2
        }
    }
}
/// Counts the bytes of `chunk` that match the lead-byte pattern of a 4-byte
/// UTF-8 sequence (`11110xxx`, i.e. `0xF0..=0xF7`).
fn _count_chunk_bytes(chunk: &[u8]) -> usize {
    let mut four_byte_leads = 0;
    for &byte in chunk {
        if matches!(byte, 0xF0..=0xF7) {
            four_byte_leads += 1;
        }
    }
    four_byte_leads
}
/// Returns how many leading bytes of `chunk` can be processed now without
/// splitting a multi-byte UTF-8 sequence that is still incomplete.
///
/// The chunk is scanned backwards for its last non-continuation byte:
///
/// * a single-byte character ends the processable prefix right after itself;
/// * a lead byte whose announced length fits in the chunk ends the prefix after
///   that full sequence;
/// * a lead byte whose sequence runs past the end of the chunk is excluded, so
///   the caller can keep it until more data arrives.
///
/// Input that is not valid UTF-8 no longer panics: a byte in `0xF8..=0xFF`
/// (never valid in UTF-8) is reported on stderr and treated like a single-byte
/// boundary, and a chunk consisting solely of continuation bytes is returned in
/// full, since no lead byte is pending that later data could complete.
///
/// `total_i` is only used for diagnostics: it is added to the in-chunk index
/// when reporting the position of an invalid byte.
///
/// An empty chunk yields `0`.
fn assumed_utf8_chunk(total_i: usize, chunk: &[u8]) -> usize {
    for (i, &b) in chunk.iter().enumerate().rev() {
        let utf_len = match b {
            // Continuation byte: keep looking for the byte that starts the sequence.
            0x80..=0xBF => continue,
            0x00..=0x7F => return i + 1,
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF7 => 4,
            0xF8..=0xFF => {
                eprintln!(
                    "found invalid UTF-8 byte {:X?} (at {:x}); treating it as a sequence boundary",
                    b,
                    total_i + i
                );
                return i + 1;
            }
        };
        return if i + utf_len > chunk.len() {
            // Sequence is cut off by the end of the chunk: stop before its lead byte.
            i
        } else {
            i + utf_len
        };
    }
    // Empty chunk (0) or continuation bytes only (everything is handed on).
    chunk.len()
}
/// Reads `input` to the end in chunks and tallies its UTF-8 lead bytes.
///
/// `chunk` is the read-buffer size in bytes. It is raised to at least 4 (the
/// longest UTF-8 sequence) so that a partially read character can never fill
/// the whole buffer; a full buffer would make the next `read` receive an empty
/// slice and return `0`, which is indistinguishable from end of input.
///
/// After each read, `assumed_utf8_chunk` decides how much of the buffered data
/// is processed; the remainder is moved to the front of the buffer and
/// completed by the next read.
///
/// # Returns
///
/// * `Err(_)` if reading fails (interrupted reads are retried);
/// * `Ok(Err(_))` if bytes left over at end of input are not valid UTF-8;
/// * `Ok(Ok(counters))` otherwise.
fn readloop(input: &mut dyn Read, chunk: usize) -> io::Result<Result<Counters, Utf8Error>> {
    const MAX_UTF8_LEN: usize = 4;
    let bufsize: usize = chunk.max(MAX_UTF8_LEN);
    let mut summative = Counters::default();
    let mut buf = vec![0u8; bufsize];
    // Bytes at the front of `buf` carried over from the previous iteration.
    let mut buffered = 0;
    // Total number of bytes read from `input` so far.
    let mut total_i = 0;
    loop {
        let n = match input.read(&mut buf[buffered..]) {
            Ok(0) => break,
            Ok(n) => n,
            // A read interrupted by a signal is not a failure; try again.
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let filled = buffered + n;
        // The chunk starts `buffered` bytes before the current read position, so
        // that is the stream offset to report positions against.
        let valid_up_to = assumed_utf8_chunk(total_i - buffered, &buf[..filled]);
        summative += Counters::count(&buf[..valid_up_to]);
        buf.copy_within(valid_up_to..filled, 0);
        buffered = filled - valid_up_to;
        total_i += n;
    }
    // Leftover bytes are normally an unfinished sequence and therefore an error;
    // validate instead of assuming so, so that this path cannot panic.
    match std::str::from_utf8(&buf[..buffered]) {
        Err(uerr) => Ok(Err(uerr)),
        Ok(_) => {
            summative += Counters::count(&buf[..buffered]);
            Ok(Ok(summative))
        }
    }
}