// xwc 0.3.0
//
// A small wc-style command line tool.
// Documentation
use std::{
    io::{self, Read},
    mem, str,
};

use memchr::memchr_iter;

/// Size in bytes (64 KiB) of the buffer `count_reader` fills on each read.
const BUFFER_SIZE: usize = 1 << 16;

/// Totals gathered for one input; values can be summed with `+=`.
///
/// `Default` yields all-zero totals.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Counts {
    /// Number of `\n` bytes.
    pub lines: u64,
    /// Number of whitespace-separated words.
    pub words: u64,
    /// Number of bytes read.
    pub bytes: u64,
}

/// Selects which optional totals `count_reader` computes.
///
/// `Default` leaves both flags off.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct CountOptions {
    /// Count newline bytes.
    pub lines: bool,
    /// Count whitespace-separated words.
    pub words: bool,
}

/// Word-counting state carried from one chunk of input to the next.
#[derive(Default, Debug)]
struct WordState {
    /// `true` when the most recently processed content was part of a word.
    in_word: bool,
    /// Bytes of a UTF-8 sequence that was cut off by the end of a chunk.
    pending_utf8: Vec<u8>,
}

/// Reads `reader` to its end and tallies what it produced.
///
/// Bytes are always counted. Newline bytes are counted only when
/// `options.lines` is set, and whitespace-separated words only when
/// `options.words` is set; a total that was not requested stays at zero.
///
/// # Errors
///
/// Returns the first I/O error reported by `reader`. Errors of kind
/// [`io::ErrorKind::Interrupted`] are not failures of the input and are
/// retried instead of being returned.
///
/// # Panics
///
/// Panics if `reader` reports having read more bytes than the buffer it was
/// handed can hold.
pub fn count_reader(mut reader: impl Read, options: CountOptions) -> io::Result<Counts> {
    let mut counts = Counts::default();
    let mut buffer = [0_u8; BUFFER_SIZE];
    let mut word_state = WordState::default();

    loop {
        let read = match reader.read(&mut buffer) {
            // A zero-length read marks the end of the input.
            Ok(0) => break,
            Ok(read) => read,
            // An interrupted call transferred no data; simply ask again.
            Err(error) if error.kind() == io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        };

        counts.bytes += read as u64;
        let chunk = buffer
            .get(..read)
            .expect("Read::read cannot report more bytes than the buffer holds");

        if options.lines {
            counts.lines += bytecount_newlines(chunk) as u64;
        }

        if options.words {
            counts.words += count_words(chunk, &mut word_state) as u64;
        }
    }

    // A UTF-8 sequence still incomplete at end of input can never be finished.
    // It counts as word content, so it opens one more word unless it directly
    // follows a word that has already been counted.
    if options.words && !word_state.pending_utf8.is_empty() && !word_state.in_word {
        counts.words += 1;
    }

    Ok(counts)
}

/// Returns the number of `\n` bytes in `buffer`.
///
/// The scan works on raw bytes through `memchr_iter`; no text decoding is
/// involved.
fn bytecount_newlines(buffer: &[u8]) -> usize {
    memchr_iter(b'\n', buffer).count()
}

/// Counts the words that begin inside `buffer`, one chunk of a longer stream.
///
/// `state` links consecutive chunks: `state.in_word` records whether the
/// previous content ended inside a word, and `state.pending_utf8` holds the
/// bytes of a UTF-8 sequence that the previous chunk cut off. Those bytes are
/// placed in front of `buffer` before decoding.
///
/// Decodable text is counted by `count_words_in_str`. An invalid byte sequence
/// is treated as word content. A sequence that is merely incomplete at the end
/// of the chunk is stored in `state.pending_utf8` for the next call.
fn count_words(buffer: &[u8], state: &mut WordState) -> usize {
    // Only allocate when there is a carried-over prefix to stitch on.
    let stitched: Vec<u8>;
    let bytes: &[u8] = if state.pending_utf8.is_empty() {
        buffer
    } else {
        let mut carried = mem::take(&mut state.pending_utf8);
        carried.extend_from_slice(buffer);
        stitched = carried;
        stitched.as_slice()
    };

    let mut total = 0;
    let mut cursor = 0;

    while cursor < bytes.len() {
        let tail = bytes
            .get(cursor..)
            .expect("offset is guarded by the loop condition");

        // Everything left decodes cleanly: count it and finish.
        let error = match str::from_utf8(tail) {
            Ok(text) => return total + count_words_in_str(text, &mut state.in_word),
            Err(error) => error,
        };

        // Count the well-formed text in front of the problem bytes.
        let prefix_end = cursor + error.valid_up_to();
        let prefix = bytes
            .get(cursor..prefix_end)
            .expect("valid_up_to returns an in-bounds offset");
        let text = str::from_utf8(prefix).expect("valid_up_to must split at a UTF-8 boundary");
        total += count_words_in_str(text, &mut state.in_word);

        match error.error_len() {
            // Invalid bytes behave like non-whitespace: they open a word
            // unless one is already in progress.
            Some(invalid_len) => {
                if !mem::replace(&mut state.in_word, true) {
                    total += 1;
                }
                cursor = prefix_end + invalid_len;
            }
            // The input ended in the middle of a sequence; keep the fragment
            // so the next chunk can complete it.
            None => {
                let unfinished = bytes
                    .get(prefix_end..)
                    .expect("offset is guarded by the loop condition");
                state.pending_utf8.extend_from_slice(unfinished);
                break;
            }
        }
    }

    total
}

/// Counts how many words start in `input`.
///
/// A word starts at a non-whitespace character that is not preceded by word
/// content. `in_word` supplies that context for the first character and is
/// left describing the last one, so a word spanning several calls is counted
/// once. For empty `input` it is left untouched.
fn count_words_in_str(input: &str, in_word: &mut bool) -> usize {
    let mut starts = 0;

    for is_space in input.chars().map(char::is_whitespace) {
        if !is_space && !*in_word {
            starts += 1;
        }
        // After any character we are inside a word exactly when it was not whitespace.
        *in_word = !is_space;
    }

    starts
}

impl std::ops::AddAssign for Counts {
    fn add_assign(&mut self, rhs: Self) {
        self.lines += rhs.lines;
        self.words += rhs.words;
        self.bytes += rhs.bytes;
    }
}

/// Unit tests for the counting routines.
///
/// Besides well-formed text, they cover chunk boundaries, invalid UTF-8
/// bytes, and a UTF-8 sequence that is cut off by the end of the input.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn counts_lines_words_and_bytes_in_utf8_text() {
        let input = "cafe\ncafé\n東京 京都".as_bytes();

        assert_eq!(
            count_reader(
                input,
                CountOptions {
                    lines: true,
                    words: true
                }
            )
            .unwrap(),
            Counts {
                lines: 2,
                words: 4,
                bytes: 24
            }
        );
    }

    #[test]
    fn skips_word_counting_when_words_are_not_requested() {
        let input = "cafe\ncafé\n東京 京都".as_bytes();

        assert_eq!(
            count_reader(
                input,
                CountOptions {
                    lines: true,
                    words: false
                }
            )
            .unwrap(),
            Counts {
                lines: 2,
                words: 0,
                bytes: 24
            }
        );
    }

    #[test]
    fn skips_line_counting_when_lines_are_not_requested() {
        let input = "one\ntwo\nthree\n".as_bytes();

        assert_eq!(
            count_reader(
                input,
                CountOptions {
                    lines: false,
                    words: false
                }
            )
            .unwrap(),
            Counts {
                lines: 0,
                words: 0,
                bytes: 14
            }
        );
    }

    #[test]
    fn counts_nothing_for_empty_input() {
        assert_eq!(
            count_reader(
                &b""[..],
                CountOptions {
                    lines: true,
                    words: true
                }
            )
            .unwrap(),
            Counts::default()
        );
    }

    #[test]
    fn counts_invalid_utf8_bytes_as_word_content() {
        // The lone 0xFF byte is not valid UTF-8; it forms a word of its own.
        assert_eq!(
            count_reader(
                &b"a \xff b"[..],
                CountOptions {
                    lines: true,
                    words: true
                }
            )
            .unwrap(),
            Counts {
                lines: 0,
                words: 3,
                bytes: 5
            }
        );
    }

    #[test]
    fn counts_truncated_utf8_sequence_at_end_of_input() {
        let options = CountOptions {
            lines: false,
            words: true,
        };

        // After whitespace, the dangling lead byte is a word of its own.
        assert_eq!(
            count_reader(&b"a \xe6"[..], options).unwrap(),
            Counts {
                lines: 0,
                words: 2,
                bytes: 3
            }
        );

        // Directly after a word, it belongs to that word.
        assert_eq!(
            count_reader(&b"a\xe6"[..], options).unwrap(),
            Counts {
                lines: 0,
                words: 1,
                bytes: 2
            }
        );
    }

    #[test]
    fn counts_words_across_buffer_boundaries() {
        let mut state = WordState::default();

        assert_eq!(count_words(b"hello", &mut state), 1);
        assert_eq!(count_words(b"world\nagain", &mut state), 1);
        assert_eq!(count_words(b" later", &mut state), 1);
    }

    #[test]
    fn counts_utf8_words_across_buffer_boundaries() {
        let mut state = WordState::default();
        let input = "東京 京都".as_bytes();

        // The first chunk ends one byte into the second character.
        assert_eq!(count_words(&input[..4], &mut state), 1);
        assert_eq!(count_words(&input[4..], &mut state), 1);
    }

    #[test]
    fn add_assign_sums_every_field() {
        let mut total = Counts {
            lines: 1,
            words: 2,
            bytes: 3,
        };
        total += Counts {
            lines: 10,
            words: 20,
            bytes: 30,
        };

        assert_eq!(
            total,
            Counts {
                lines: 11,
                words: 22,
                bytes: 33
            }
        );
    }
}