coreutils_rs/wc/core.rs
1use memchr::memchr_iter;
2
/// Results from counting a byte slice.
///
/// All fields start at zero via `Default`. The per-field notes below describe
/// the values as they are filled in by `count_all` in this module.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of newline (`\n`) bytes seen.
    pub lines: u64,
    /// Number of maximal runs of non-separator bytes.
    pub words: u64,
    /// Total length of the input in bytes.
    pub bytes: u64,
    /// Number of bytes that are not UTF-8 continuation bytes (0x80..=0xBF).
    pub chars: u64,
    /// Largest per-line width, where a tab advances to the next multiple of 8,
    /// a carriage return adds nothing, and every other byte adds one.
    pub max_line_length: u64,
}
12
13/// Count newlines using SIMD-accelerated memchr.
14/// GNU wc counts newline bytes, not logical lines.
15#[inline]
16pub fn count_lines(data: &[u8]) -> u64 {
17 memchr_iter(b'\n', data).count() as u64
18}
19
/// Return the length of `data` in bytes.
///
/// This is just the slice length widened to `u64`; it exists so byte counting
/// has the same call shape as the other counters in this module.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let length: usize = data.len();
    length as u64
}
25
/// Report whether `b` separates words.
///
/// The separators are exactly six ASCII bytes: space (0x20), horizontal tab
/// (0x09), newline (0x0A), vertical tab (0x0B), form feed (0x0C) and carriage
/// return (0x0D). Every other byte, including NUL and all bytes >= 0x80, is
/// treated as part of a word.
#[inline]
fn is_word_separator(b: u8) -> bool {
    // TAB, LF, VT, FF and CR are the contiguous range 0x09..=0x0D.
    b == b' ' || (0x09..=0x0D).contains(&b)
}
33
34/// Count words: a word is a maximal sequence of non-whitespace bytes.
35/// Matches GNU wc behavior using `isspace()` from C locale.
36pub fn count_words(data: &[u8]) -> u64 {
37 let mut words: u64 = 0;
38 let mut in_word = false;
39
40 for &b in data {
41 if is_word_separator(b) {
42 in_word = false;
43 } else if !in_word {
44 in_word = true;
45 words += 1;
46 }
47 }
48 words
49}
50
/// Count the characters in `data`, interpreting it as UTF-8.
///
/// Every well-formed UTF-8 sequence counts as one character. Every byte that
/// is not part of a well-formed sequence (a stray continuation byte, a lead
/// byte whose continuation bytes are missing or wrong, an overlong or
/// otherwise invalid encoding, a sequence truncated by the end of the input)
/// counts as one character on its own, so malformed input never hides the
/// valid characters that follow it.
///
/// NOTE: `count_all` counts characters with a cheaper "not a continuation
/// byte" test, so the two can disagree on malformed input; they agree on
/// valid UTF-8.
pub fn count_chars(data: &[u8]) -> u64 {
    let mut chars: u64 = 0;
    let mut rest = data;

    while !rest.is_empty() {
        // Split `rest` into a valid prefix followed by one malformed span.
        let (valid_len, invalid_len) = match std::str::from_utf8(rest) {
            Ok(_) => (rest.len(), 0),
            Err(err) => {
                let valid_len = err.valid_up_to();
                // `error_len()` is `None` when the input ends in the middle of
                // a sequence; everything after the valid prefix is then bad.
                let invalid_len = err.error_len().unwrap_or(rest.len() - valid_len);
                (valid_len, invalid_len)
            }
        };

        // Within validated UTF-8, each character has exactly one byte that is
        // not a continuation byte (10xxxxxx), so counting those counts chars.
        let valid_chars = rest[..valid_len]
            .iter()
            .filter(|&&b| (b & 0xC0) != 0x80)
            .count();

        chars += valid_chars as u64 + invalid_len as u64;

        // On error `invalid_len >= 1`, so the slice always shrinks.
        rest = &rest[valid_len + invalid_len..];
    }

    chars
}
82
/// Return the width of the widest line in `data`.
///
/// Width is tracked as a running column per line:
/// - `\n` ends the line and resets the column to zero;
/// - `\r` leaves the column unchanged;
/// - `\t` moves the column to the next multiple of 8;
/// - any other byte advances the column by one (so width is measured in
///   bytes, not in display cells).
///
/// A final line without a trailing newline is included in the comparison.
pub fn max_line_length(data: &[u8]) -> u64 {
    let mut widest: u64 = 0;
    let mut column: u64 = 0;

    for byte in data.iter().copied() {
        match byte {
            b'\n' => {
                widest = widest.max(column);
                column = 0;
            }
            // Carriage return does not take up any width.
            b'\r' => {}
            // Round up to the next tab stop (multiple of 8).
            b'\t' => column = (column + 8) & !7,
            _ => column += 1,
        }
    }

    // Account for a last line that is not newline-terminated.
    widest.max(column)
}
113
114/// Count everything in a single pass for maximum efficiency.
115/// When multiple flags are requested, this avoids re-scanning.
116///
117/// Iterates byte-by-byte for line/word/max-line-length counting, and uses
118/// UTF-8 leading byte detection for character counting (a byte is a char
119/// start if it's not a continuation byte 0x80..0xBF).
120pub fn count_all(data: &[u8]) -> WcCounts {
121 let mut lines: u64 = 0;
122 let mut words: u64 = 0;
123 let mut chars: u64 = 0;
124 let mut max_len: u64 = 0;
125 let mut current_line_len: u64 = 0;
126 let mut in_word = false;
127
128 for &b in data {
129 // Line length / max line length tracking
130 if b == b'\n' {
131 lines += 1;
132 if current_line_len > max_len {
133 max_len = current_line_len;
134 }
135 current_line_len = 0;
136 } else if b == b'\r' {
137 // CR: don't add to line length
138 } else if b == b'\t' {
139 current_line_len = (current_line_len + 8) & !7;
140 } else {
141 current_line_len += 1;
142 }
143
144 // Word counting
145 if is_word_separator(b) {
146 in_word = false;
147 } else if !in_word {
148 in_word = true;
149 words += 1;
150 }
151
152 // Character counting: count byte if it's not a UTF-8 continuation byte.
153 // Continuation bytes are 0x80..0xBF (10xxxxxx). Everything else starts
154 // a new character (ASCII, or a multi-byte leading byte, or invalid).
155 if (b & 0xC0) != 0x80 {
156 chars += 1;
157 }
158 }
159
160 // Handle last line without trailing newline
161 if current_line_len > max_len {
162 max_len = current_line_len;
163 }
164
165 WcCounts {
166 lines,
167 words,
168 bytes: data.len() as u64,
169 chars,
170 max_line_length: max_len,
171 }
172}