Skip to main content

coreutils_rs/wc/
core.rs

1use memchr::memchr_iter;
2
/// Tallies produced by scanning one byte slice.
///
/// Every field starts at zero via `Default`, so an empty input is represented
/// by `WcCounts::default()`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct WcCounts {
    /// Number of newline (`\n`) bytes seen.
    pub lines: u64,
    /// Number of maximal runs of non-whitespace bytes.
    pub words: u64,
    /// Total length of the input in bytes.
    pub bytes: u64,
    /// Number of characters counted in the input.
    pub chars: u64,
    /// Width of the widest line in the input.
    pub max_line_length: u64,
}
12
13/// Count newlines using SIMD-accelerated memchr.
14/// GNU wc counts newline bytes, not logical lines.
15#[inline]
16pub fn count_lines(data: &[u8]) -> u64 {
17    memchr_iter(b'\n', data).count() as u64
18}
19
/// Return the length of `data` in bytes.
///
/// This is only a widening of `data.len()`; it exists so that byte counting
/// has the same call shape as the other counters in this module.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let byte_len: usize = data.len();
    byte_len as u64
}
25
/// Report whether `b` separates words.
///
/// The separators are the six ASCII whitespace bytes: space (0x20) and the
/// contiguous control range tab (0x09), newline (0x0A), vertical tab (0x0B),
/// form feed (0x0C) and carriage return (0x0D). This is the set C's
/// `isspace()` accepts in the C locale.
#[inline]
fn is_word_separator(b: u8) -> bool {
    // 0x09..=0x0D covers \t, \n, \v, \f and \r in one range check.
    b == b' ' || (0x09..=0x0D).contains(&b)
}
33
34/// Count words: a word is a maximal sequence of non-whitespace bytes.
35/// Matches GNU wc behavior using `isspace()` from C locale.
36pub fn count_words(data: &[u8]) -> u64 {
37    let mut words: u64 = 0;
38    let mut in_word = false;
39
40    for &b in data {
41        if is_word_separator(b) {
42            in_word = false;
43        } else if !in_word {
44            in_word = true;
45            words += 1;
46        }
47    }
48    words
49}
50
/// Count UTF-8 characters in `data`.
///
/// A lead byte followed by the right number of continuation bytes
/// (`0x80..=0xBF`) counts as one character. Anything that does not form such
/// a sequence counts one character per byte, so that no input byte is ever
/// silently skipped. That covers:
///
/// * a stray continuation byte,
/// * a lead byte whose continuation bytes are missing, wrong, or cut off by
///   the end of the buffer,
/// * bytes `0xF8..=0xFF`, which never start a UTF-8 sequence.
///
/// The check is structural only: overlong encodings and surrogate code points
/// are not rejected and still count as one character.
pub fn count_chars(data: &[u8]) -> u64 {
    let mut chars: u64 = 0;
    let mut rest = data;

    while let Some(&lead) = rest.first() {
        // Sequence length announced by the lead byte. ASCII, stray
        // continuation bytes and 0xF8..=0xFF are all handled as length 1.
        let seq_len = match lead {
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF7 => 4,
            _ => 1,
        };

        // Only consume the whole sequence when every announced continuation
        // byte is present and well-formed; otherwise consume just the lead
        // byte so the bytes after it are examined on their own.
        let consumed = match rest.get(1..seq_len) {
            Some(tail) if tail.iter().all(|&b| (b & 0xC0) == 0x80) => seq_len,
            _ => 1,
        };

        chars += 1;
        rest = &rest[consumed..];
    }

    chars
}
82
/// Return the width of the widest line in `data`.
///
/// Lines are separated by newline (`\n`); the text after the last newline is
/// treated as a line too, so input without a trailing newline is handled.
/// Within a line the width is accumulated byte by byte:
///
/// * carriage return (`\r`) adds nothing,
/// * tab (`\t`) advances to the next multiple of 8,
/// * every other byte adds 1.
///
/// Empty input yields 0.
pub fn max_line_length(data: &[u8]) -> u64 {
    data.split(|&byte| byte == b'\n')
        .map(|line| {
            line.iter().fold(0u64, |width, &byte| match byte {
                b'\r' => width,
                // Round up to the next tab stop (multiples of 8).
                b'\t' => (width + 8) & !7,
                _ => width + 1,
            })
        })
        .max()
        .unwrap_or(0)
}
113
114/// Count everything in a single pass for maximum efficiency.
115/// When multiple flags are requested, this avoids re-scanning.
116///
117/// Iterates byte-by-byte for line/word/max-line-length counting, and uses
118/// UTF-8 leading byte detection for character counting (a byte is a char
119/// start if it's not a continuation byte 0x80..0xBF).
120pub fn count_all(data: &[u8]) -> WcCounts {
121    let mut lines: u64 = 0;
122    let mut words: u64 = 0;
123    let mut chars: u64 = 0;
124    let mut max_len: u64 = 0;
125    let mut current_line_len: u64 = 0;
126    let mut in_word = false;
127
128    for &b in data {
129        // Line length / max line length tracking
130        if b == b'\n' {
131            lines += 1;
132            if current_line_len > max_len {
133                max_len = current_line_len;
134            }
135            current_line_len = 0;
136        } else if b == b'\r' {
137            // CR: don't add to line length
138        } else if b == b'\t' {
139            current_line_len = (current_line_len + 8) & !7;
140        } else {
141            current_line_len += 1;
142        }
143
144        // Word counting
145        if is_word_separator(b) {
146            in_word = false;
147        } else if !in_word {
148            in_word = true;
149            words += 1;
150        }
151
152        // Character counting: count byte if it's not a UTF-8 continuation byte.
153        // Continuation bytes are 0x80..0xBF (10xxxxxx). Everything else starts
154        // a new character (ASCII, or a multi-byte leading byte, or invalid).
155        if (b & 0xC0) != 0x80 {
156            chars += 1;
157        }
158    }
159
160    // Handle last line without trailing newline
161    if current_line_len > max_len {
162        max_len = current_line_len;
163    }
164
165    WcCounts {
166        lines,
167        words,
168        bytes: data.len() as u64,
169        chars,
170        max_line_length: max_len,
171    }
172}