Skip to main content

coreutils_rs/wc/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Minimum data size to use parallel processing (2MB).
/// Inputs below this are counted on a single thread; the lower threshold
/// lets us exploit 4 cores on smaller files.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
7
/// Results from counting a byte slice (one field per `wc` metric).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Newline (`\n`) byte count (`wc -l`).
    pub lines: u64,
    /// Word count (`wc -w`).
    pub words: u64,
    /// Byte count (`wc -c`).
    pub bytes: u64,
    /// Character count (`wc -m`): UTF-8 characters or raw bytes, by locale.
    pub chars: u64,
    /// Display width of the widest line (`wc -L`).
    pub max_line_length: u64,
}
17
/// Build the non-word lookup table for UTF-8 locale word boundary detection.
/// In UTF-8 locale, GNU wc uses mbrtowc() + iswspace(). Non-word bytes:
/// - C0 control chars (0x00-0x1F): non-printable, iswprint() = false
/// - Space (0x20): iswspace() = true
/// - DEL (0x7F): non-printable
/// - Invalid standalone UTF-8: 0xC0, 0xC1, 0xFE, 0xFF (mbrtowc fails)
/// Valid UTF-8 continuation (0x80-0xBF) and leader bytes (0xC2-0xFD) remain word content.
const fn make_ws_table_utf8() -> [u8; 256] {
    let mut table = [0u8; 256];
    // Mark every C0 control character (0x00-0x1F) as non-word.
    let mut idx = 0usize;
    while idx <= 0x1F {
        table[idx] = 1;
        idx += 1;
    }
    table[0x20] = 1; // space
    table[0x7F] = 1; // DEL
    // Bytes that can never appear in valid UTF-8.
    table[0xC0] = 1; // overlong encoding
    table[0xC1] = 1; // overlong encoding
    table[0xFE] = 1; // invalid UTF-8
    table[0xFF] = 1; // invalid UTF-8
    table
}
42
/// Build the non-word lookup table for C/POSIX locale word boundary detection.
/// In C locale, GNU wc uses mbrtowc() which fails on bytes >= 0x80, treating them
/// as non-word characters. Also treats all control chars (0x01-0x08, 0x0E-0x1F, 0x7F)
/// as non-word, matching the behavior where only printable ASCII forms words.
const fn make_ws_table_c() -> [u8; 256] {
    // Default every byte to non-word, then carve out the word characters:
    // printable ASCII excluding space, i.e. 0x21-0x7E.
    let mut table = [1u8; 256];
    let mut idx = 0x21usize;
    while idx <= 0x7E {
        table[idx] = 0;
        idx += 1;
    }
    table
}
57
/// UTF-8 locale: C0 controls, space, DEL, and invalid-UTF-8 bytes
/// (0xC0, 0xC1, 0xFE, 0xFF) are non-word; see `make_ws_table_utf8`.
const WS_TABLE_UTF8: [u8; 256] = make_ws_table_utf8();

/// C locale: all non-printable-ASCII bytes are non-word (bytes >= 0x80, controls, etc.)
const WS_TABLE_C: [u8; 256] = make_ws_table_c();
63
64/// Get the appropriate word-boundary table for the current locale.
65/// This is set once at startup and used throughout.
66#[inline]
67pub fn ws_table(utf8: bool) -> &'static [u8; 256] {
68    if utf8 { &WS_TABLE_UTF8 } else { &WS_TABLE_C }
69}
70
/// Build the printable-ASCII lookup table: 0x20 (space) through 0x7E (~) map to 1.
const fn make_printable_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut idx = 0x20usize;
    while idx <= 0x7E {
        table[idx] = 1;
        idx += 1;
    }
    table
}
81
82const PRINTABLE_TABLE: [u8; 256] = make_printable_table();
83
84/// Count newlines using SIMD-accelerated memchr.
85/// GNU wc counts newline bytes (`\n`), not logical lines.
86#[inline]
87pub fn count_lines(data: &[u8]) -> u64 {
88    memchr_iter(b'\n', data).count() as u64
89}
90
/// Count bytes. Trivial, but kept so every metric has a uniform entry point.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let total = data.len();
    total as u64
}
96
/// Count words assuming a UTF-8 locale.
///
/// A word is a maximal run of word-character bytes.
/// Thin wrapper over `count_words_locale` with `utf8 = true`; see that
/// function for the locale-specific word-boundary rules.
pub fn count_words(data: &[u8]) -> u64 {
    count_words_locale(data, true)
}
108
109/// Count words with explicit locale control.
110/// C locale uses scalar table-based path.
111/// UTF-8 locale uses a state machine for correct multi-byte sequence handling.
112pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
113    if utf8 {
114        count_words_utf8(data)
115    } else {
116        count_words_with_table(data, &WS_TABLE_C)
117    }
118}
119
/// Count words in UTF-8 locale using a state machine.
/// Correctly handles:
/// - Valid multi-byte UTF-8 sequences (word content)
/// - Invalid standalone continuation bytes (non-word, mbrtowc fails)
/// - Control characters (non-word)
/// - Invalid UTF-8 leaders without proper continuations (non-word)
fn count_words_utf8(data: &[u8]) -> u64 {
    // True when data[p] exists and is a UTF-8 continuation byte (10xxxxxx).
    let is_cont = |p: usize| matches!(data.get(p), Some(b) if b & 0xC0 == 0x80);

    let mut total = 0u64;
    let mut inside_word = false;
    let mut pos = 0usize;

    while pos < data.len() {
        let lead = data[pos];

        // Classify the byte (or sequence) at `pos`: is it word content,
        // and how many bytes does it consume?
        let (is_word, step) = if lead <= 0x20 || lead == 0x7F {
            // Control chars (0x00-0x1F), space (0x20), DEL (0x7F): non-word.
            (false, 1)
        } else if lead < 0x80 {
            // Printable ASCII (0x21-0x7E): word content.
            (true, 1)
        } else if lead < 0xC2 {
            // 0x80-0xBF: standalone continuation byte (invalid UTF-8).
            // 0xC0-0xC1: overlong 2-byte leader (invalid UTF-8).
            (false, 1)
        } else if lead < 0xE0 {
            // 2-byte sequence: needs 1 continuation byte.
            if is_cont(pos + 1) { (true, 2) } else { (false, 1) }
        } else if lead < 0xF0 {
            // 3-byte sequence: needs 2 continuation bytes.
            if is_cont(pos + 1) && is_cont(pos + 2) {
                (true, 3)
            } else {
                (false, 1)
            }
        } else if lead < 0xF5 {
            // 4-byte sequence: needs 3 continuation bytes.
            if is_cont(pos + 1) && is_cont(pos + 2) && is_cont(pos + 3) {
                (true, 4)
            } else {
                (false, 1)
            }
        } else {
            // 0xF5-0xFF: invalid UTF-8.
            (false, 1)
        };

        if is_word {
            // Entering a word from a gap starts a new word.
            if !inside_word {
                inside_word = true;
                total += 1;
            }
        } else {
            inside_word = false;
        }
        pos += step;
    }

    total
}
199
/// Scalar word counting with a given non-word lookup table.
///
/// Processes 64 bytes at a time: builds a bitmask of non-word positions,
/// then popcounts the "word start" transitions (non-word byte followed by
/// a word byte). The bit before the first byte is treated as non-word so a
/// word at offset 0 is counted.
fn count_words_with_table(data: &[u8], table: &[u8; 256]) -> u64 {
    let mut total = 0u64;
    // Carry bit between blocks: 1 if the byte preceding the block was non-word.
    let mut carry_ws = 1u64;

    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    for block in blocks {
        let mut ws_bits = 0u64;
        for (bit, &byte) in block.iter().enumerate() {
            ws_bits |= (table[byte as usize] as u64) << bit;
        }

        // Word starts: previous position non-word AND current position word.
        let prev_bits = (ws_bits << 1) | carry_ws;
        let starts = prev_bits & !ws_bits;
        total += starts.count_ones() as u64;
        carry_ws = (ws_bits >> 63) & 1;
    }

    // Scalar sweep over the final partial block.
    let mut prev_ws = carry_ws as u8;
    for &byte in tail {
        let curr_ws = table[byte as usize];
        total += (prev_ws & (curr_ws ^ 1)) as u64;
        prev_ws = curr_ws;
    }
    total
}
237
/// SSE2-accelerated word counting (UTF-8 locale only).
/// NOTE: Currently unused in favor of scalar table-based approach for correctness.
/// Kept for potential future re-enablement after updating to handle all non-word bytes.
///
/// NOTE(review): the vector path treats only `\0`, `\t`..`\r`, and space as
/// separators, while the scalar tail below uses `WS_TABLE_UTF8` (which also
/// marks other C0 controls, DEL, and invalid-UTF-8 bytes as non-word). The
/// two paths therefore disagree for those bytes — part of why this is dead code.
///
/// # Safety
/// Caller must ensure SSE2 is available on the running CPU (always true on
/// x86_64; the `#[target_feature]` attribute is what makes this `unsafe fn`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(dead_code)]
unsafe fn count_words_sse2(data: &[u8]) -> u64 {
    use std::arch::x86_64::*;

    unsafe {
        // Whitespace = (b == 0x00) || (0x09 <= b <= 0x0D) || (b == 0x20)
        // Null bytes are word separators in GNU wc.
        // Using signed comparison: cmpgt(b, 0x08) && cmpgt(0x0E, b) || cmpeq(b, 0x20) || cmpeq(b, 0x00)
        let zero = _mm_setzero_si128();
        let min_ws = _mm_set1_epi8(0x08); // one below \t
        let max_ws = _mm_set1_epi8(0x0E); // one above \r
        let space = _mm_set1_epi8(0x20);

        let mut words = 0u64;
        let mut prev_ws_bit = 1u64; // treat start-of-data as whitespace

        let chunks = data.chunks_exact(64);
        let remainder = chunks.remainder();

        for chunk in chunks {
            let ptr = chunk.as_ptr();

            // Load 4 x 16-byte vectors (unaligned loads are safe for any slice).
            let v0 = _mm_loadu_si128(ptr as *const __m128i);
            let v1 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
            let v2 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
            let v3 = _mm_loadu_si128(ptr.add(48) as *const __m128i);

            // Detect whitespace in each vector: range check + space + null
            macro_rules! detect_ws {
                ($v:expr) => {{
                    let ge_9 = _mm_cmpgt_epi8($v, min_ws);
                    let le_d = _mm_cmpgt_epi8(max_ws, $v);
                    let in_range = _mm_and_si128(ge_9, le_d);
                    let is_sp = _mm_cmpeq_epi8($v, space);
                    let is_null = _mm_cmpeq_epi8($v, zero);
                    let ws = _mm_or_si128(in_range, is_sp);
                    _mm_or_si128(ws, is_null)
                }};
            }

            let ws0 = detect_ws!(v0);
            let ws1 = detect_ws!(v1);
            let ws2 = detect_ws!(v2);
            let ws3 = detect_ws!(v3);

            // Combine 4 x 16-bit movemasks into one 64-bit whitespace mask
            let m0 = (_mm_movemask_epi8(ws0) as u16) as u64;
            let m1 = (_mm_movemask_epi8(ws1) as u16) as u64;
            let m2 = (_mm_movemask_epi8(ws2) as u16) as u64;
            let m3 = (_mm_movemask_epi8(ws3) as u16) as u64;
            let ws_mask = m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);

            // Word starts: where previous byte was whitespace AND current is NOT
            let prev_mask = (ws_mask << 1) | prev_ws_bit;
            let word_starts = prev_mask & !ws_mask;
            words += word_starts.count_ones() as u64;

            prev_ws_bit = (ws_mask >> 63) & 1;
        }

        // Handle 16-byte sub-chunks of remainder
        let sub_chunks = remainder.chunks_exact(16);
        let sub_remainder = sub_chunks.remainder();
        let mut prev_ws_u32 = prev_ws_bit as u32;

        for chunk in sub_chunks {
            let v = _mm_loadu_si128(chunk.as_ptr() as *const __m128i);
            let ge_9 = _mm_cmpgt_epi8(v, min_ws);
            let le_d = _mm_cmpgt_epi8(max_ws, v);
            let in_range = _mm_and_si128(ge_9, le_d);
            let is_sp = _mm_cmpeq_epi8(v, space);
            let is_null = _mm_cmpeq_epi8(v, zero);
            let ws_vec = _mm_or_si128(_mm_or_si128(in_range, is_sp), is_null);
            let ws_mask = _mm_movemask_epi8(ws_vec) as u32;

            // Only the low 16 bits of the mask are meaningful here.
            let prev_mask = (ws_mask << 1) | prev_ws_u32;
            let word_starts = prev_mask & (!ws_mask & 0xFFFF);
            words += word_starts.count_ones() as u64;
            prev_ws_u32 = (ws_mask >> 15) & 1;
        }

        // Scalar for final <16 bytes (note: uses the wider WS_TABLE_UTF8
        // separator set — see the NOTE(review) in the doc comment).
        let mut prev_ws = prev_ws_u32 as u8;
        for &b in sub_remainder {
            let curr_ws = WS_TABLE_UTF8[b as usize];
            words += (prev_ws & (curr_ws ^ 1)) as u64;
            prev_ws = curr_ws;
        }
        words
    }
}
335
336/// Count lines and words in a single pass using 64-byte bitmask blocks.
337pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
338    let table = ws_table(utf8);
339    let mut words = 0u64;
340    let mut lines = 0u64;
341    let mut prev_ws_bit = 1u64;
342
343    let chunks = data.chunks_exact(64);
344    let remainder = chunks.remainder();
345
346    for chunk in chunks {
347        let mut ws_mask = 0u64;
348        let mut nl_mask = 0u64;
349        let mut i = 0;
350        while i + 7 < 64 {
351            let b0 = chunk[i];
352            let b1 = chunk[i + 1];
353            let b2 = chunk[i + 2];
354            let b3 = chunk[i + 3];
355            let b4 = chunk[i + 4];
356            let b5 = chunk[i + 5];
357            let b6 = chunk[i + 6];
358            let b7 = chunk[i + 7];
359            ws_mask |= (table[b0 as usize] as u64) << i;
360            ws_mask |= (table[b1 as usize] as u64) << (i + 1);
361            ws_mask |= (table[b2 as usize] as u64) << (i + 2);
362            ws_mask |= (table[b3 as usize] as u64) << (i + 3);
363            ws_mask |= (table[b4 as usize] as u64) << (i + 4);
364            ws_mask |= (table[b5 as usize] as u64) << (i + 5);
365            ws_mask |= (table[b6 as usize] as u64) << (i + 6);
366            ws_mask |= (table[b7 as usize] as u64) << (i + 7);
367            nl_mask |= ((b0 == b'\n') as u64) << i;
368            nl_mask |= ((b1 == b'\n') as u64) << (i + 1);
369            nl_mask |= ((b2 == b'\n') as u64) << (i + 2);
370            nl_mask |= ((b3 == b'\n') as u64) << (i + 3);
371            nl_mask |= ((b4 == b'\n') as u64) << (i + 4);
372            nl_mask |= ((b5 == b'\n') as u64) << (i + 5);
373            nl_mask |= ((b6 == b'\n') as u64) << (i + 6);
374            nl_mask |= ((b7 == b'\n') as u64) << (i + 7);
375            i += 8;
376        }
377
378        let prev_mask = (ws_mask << 1) | prev_ws_bit;
379        let word_starts = prev_mask & !ws_mask;
380        words += word_starts.count_ones() as u64;
381        lines += nl_mask.count_ones() as u64;
382        prev_ws_bit = (ws_mask >> 63) & 1;
383    }
384
385    let mut prev_ws = prev_ws_bit as u8;
386    for &b in remainder {
387        if b == b'\n' {
388            lines += 1;
389        }
390        let curr_ws = table[b as usize];
391        words += (prev_ws & (curr_ws ^ 1)) as u64;
392        prev_ws = curr_ws;
393    }
394    (lines, words)
395}
396
397/// Count lines, words, and chars in a single pass using 64-byte bitmask blocks.
398pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
399    let table = ws_table(utf8);
400    let mut words = 0u64;
401    let mut lines = 0u64;
402    let mut chars = 0u64;
403    let mut prev_ws_bit = 1u64;
404
405    let chunks = data.chunks_exact(64);
406    let remainder = chunks.remainder();
407
408    for chunk in chunks {
409        let mut ws_mask = 0u64;
410        let mut nl_mask = 0u64;
411        let mut char_mask = 0u64;
412        let mut i = 0;
413        while i + 7 < 64 {
414            let b0 = chunk[i];
415            let b1 = chunk[i + 1];
416            let b2 = chunk[i + 2];
417            let b3 = chunk[i + 3];
418            let b4 = chunk[i + 4];
419            let b5 = chunk[i + 5];
420            let b6 = chunk[i + 6];
421            let b7 = chunk[i + 7];
422
423            ws_mask |= (table[b0 as usize] as u64) << i
424                | (table[b1 as usize] as u64) << (i + 1)
425                | (table[b2 as usize] as u64) << (i + 2)
426                | (table[b3 as usize] as u64) << (i + 3)
427                | (table[b4 as usize] as u64) << (i + 4)
428                | (table[b5 as usize] as u64) << (i + 5)
429                | (table[b6 as usize] as u64) << (i + 6)
430                | (table[b7 as usize] as u64) << (i + 7);
431
432            nl_mask |= ((b0 == b'\n') as u64) << i
433                | ((b1 == b'\n') as u64) << (i + 1)
434                | ((b2 == b'\n') as u64) << (i + 2)
435                | ((b3 == b'\n') as u64) << (i + 3)
436                | ((b4 == b'\n') as u64) << (i + 4)
437                | ((b5 == b'\n') as u64) << (i + 5)
438                | ((b6 == b'\n') as u64) << (i + 6)
439                | ((b7 == b'\n') as u64) << (i + 7);
440
441            if utf8 {
442                char_mask |= (((b0 & 0xC0) != 0x80) as u64) << i
443                    | (((b1 & 0xC0) != 0x80) as u64) << (i + 1)
444                    | (((b2 & 0xC0) != 0x80) as u64) << (i + 2)
445                    | (((b3 & 0xC0) != 0x80) as u64) << (i + 3)
446                    | (((b4 & 0xC0) != 0x80) as u64) << (i + 4)
447                    | (((b5 & 0xC0) != 0x80) as u64) << (i + 5)
448                    | (((b6 & 0xC0) != 0x80) as u64) << (i + 6)
449                    | (((b7 & 0xC0) != 0x80) as u64) << (i + 7);
450            }
451
452            i += 8;
453        }
454        let prev_mask = (ws_mask << 1) | prev_ws_bit;
455        let word_starts = prev_mask & !ws_mask;
456        words += word_starts.count_ones() as u64;
457        lines += nl_mask.count_ones() as u64;
458        chars += char_mask.count_ones() as u64;
459        prev_ws_bit = (ws_mask >> 63) & 1;
460    }
461
462    let mut prev_ws = prev_ws_bit as u8;
463    for &b in remainder {
464        if b == b'\n' {
465            lines += 1;
466        }
467        let curr_ws = table[b as usize];
468        words += (prev_ws & (curr_ws ^ 1)) as u64;
469        prev_ws = curr_ws;
470        if utf8 {
471            chars += ((b & 0xC0) != 0x80) as u64;
472        }
473    }
474    if !utf8 {
475        chars = data.len() as u64;
476    }
477    (lines, words, chars)
478}
479
/// Count UTF-8 characters by counting non-continuation bytes.
/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
///
/// Uses 64-byte block processing with popcount for ~4x throughput vs scalar.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    let mut total = 0u64;
    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    for block in blocks {
        // Bit i of the mask is set when block[i] is NOT a continuation byte.
        let mut char_bits = 0u64;
        for (bit, &byte) in block.iter().enumerate() {
            char_bits |= (((byte & 0xC0) != 0x80) as u64) << bit;
        }
        total += char_bits.count_ones() as u64;
    }

    // Scalar sweep over the final partial block.
    let tail_chars = tail.iter().filter(|&&b| (b & 0xC0) != 0x80).count();
    total + tail_chars as u64
}
513
/// Count characters in C/POSIX locale, where every byte is one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_count = data.len();
    byte_count as u64
}
519
520/// Count characters, choosing behavior based on locale.
521#[inline]
522pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
523    if utf8 {
524        count_chars_utf8(data)
525    } else {
526        count_chars_c(data)
527    }
528}
529
/// Detect if the current locale uses UTF-8 encoding.
///
/// Checks `LC_ALL`, `LC_CTYPE`, `LANG` in POSIX precedence order; the first
/// variable that is set and non-empty decides the answer (a non-UTF-8 value
/// there does NOT fall through to later variables). Defaults to `false`
/// when none are set.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .find_map(|name| std::env::var(name).ok().filter(|v| !v.is_empty()))
        .map(|value| {
            let lower = value.to_ascii_lowercase();
            lower.contains("utf-8") || lower.contains("utf8")
        })
        .unwrap_or(false)
}
542
/// Decode one UTF-8 character from a byte slice.
/// Returns (codepoint, byte_length). On any invalid or truncated sequence,
/// falls back to (first byte as u32, 1).
///
/// NOTE(review): indexes `bytes[0]`, so an empty slice panics; callers in
/// this file only invoke it with at least one byte remaining — confirm if reused.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    // True when bytes[k] exists and is a continuation byte (0b10xxxxxx).
    let cont = |k: usize| bytes.get(k).map_or(false, |&b| b & 0xC0 == 0x80);
    let lead = bytes[0];

    match lead {
        // Plain ASCII.
        0x00..=0x7F => (lead as u32, 1),
        // 2-byte sequence. 0x80-0xC1 (continuations and overlong leads)
        // deliberately fall through to the catch-all below.
        0xC2..=0xDF if cont(1) => {
            let cp = ((lead as u32 & 0x1F) << 6) | (bytes[1] as u32 & 0x3F);
            (cp, 2)
        }
        // 3-byte sequence.
        0xE0..=0xEF if cont(1) && cont(2) => {
            let cp = ((lead as u32 & 0x0F) << 12)
                | ((bytes[1] as u32 & 0x3F) << 6)
                | (bytes[2] as u32 & 0x3F);
            (cp, 3)
        }
        // 4-byte sequence; leads above 0xF4 are invalid UTF-8.
        0xF0..=0xF4 if cont(1) && cont(2) && cont(3) => {
            let cp = ((lead as u32 & 0x07) << 18)
                | ((bytes[1] as u32 & 0x3F) << 12)
                | ((bytes[2] as u32 & 0x3F) << 6)
                | (bytes[3] as u32 & 0x3F);
            (cp, 4)
        }
        // Continuation byte as lead, overlong lead, truncated sequence,
        // or lead >= 0xF5: treat as a single invalid byte.
        _ => (lead as u32, 1),
    }
}
586
/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
///
/// Hand-maintained range table covering CJK blocks, Hangul, fullwidth forms,
/// and emoji-presentation symbols. NOTE(review): widths move between Unicode
/// versions — verify this list against the Unicode East Asian Width data
/// (UAX #11) for the version GNU wc is being compared with.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(cp,
        0x1100..=0x115F   // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3041..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3  // Hangul Syllables
        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19  // Vertical Forms
        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
        | 0xFF01..=0xFF60  // Fullwidth Latin, Halfwidth Katakana
        | 0xFFE0..=0xFFE6  // Fullwidth Signs
        | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
671
/// Compute maximum display width of any line (C/POSIX locale).
///
/// GNU wc -L behavior in C locale:
/// - `\n`: line terminator (records max, resets position)
/// - `\t`: advances to next tab stop (multiple of 8)
/// - `\r`: carriage return (resets position to 0, same line)
/// - `\f`: form feed (acts as line terminator like \n)
/// - Printable ASCII (0x20..0x7E): width 1
/// - Everything else (controls, high bytes): width 0
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut widest: u64 = 0;
    let mut line_max: u64 = 0; // widest column reached on the current line
    let mut column: u64 = 0; // current cursor column

    for &byte in data {
        match byte {
            // Newline and form feed both terminate the line.
            b'\n' | 0x0C => {
                widest = widest.max(line_max);
                column = 0;
                line_max = 0;
            }
            b'\t' => {
                // Advance to the next multiple-of-8 tab stop.
                column = (column + 8) & !7;
                line_max = line_max.max(column);
            }
            // Carriage return rewinds the cursor without ending the line.
            b'\r' => column = 0,
            // Printable ASCII, including space: width 1.
            0x20..=0x7E => {
                column += 1;
                line_max = line_max.max(column);
            }
            // Other controls and high bytes: width 0.
            _ => {}
        }
    }

    // The last line may have no terminator.
    widest.max(line_max)
}
731
732/// Compute maximum display width of any line (UTF-8 locale).
733///
734/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
735/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
736pub fn max_line_length_utf8(data: &[u8]) -> u64 {
737    let mut max_len: u64 = 0;
738    let mut line_len: u64 = 0;
739    let mut linepos: u64 = 0;
740    let mut i = 0;
741
742    while i < data.len() {
743        let b = data[i];
744
745        // Fast path for common ASCII
746        if b < 0x80 {
747            match b {
748                b'\n' => {
749                    if line_len > max_len {
750                        max_len = line_len;
751                    }
752                    linepos = 0;
753                    line_len = 0;
754                }
755                b'\t' => {
756                    linepos = (linepos + 8) & !7;
757                    if linepos > line_len {
758                        line_len = linepos;
759                    }
760                }
761                b'\r' => {
762                    linepos = 0;
763                }
764                0x0C => {
765                    // Form feed: line terminator
766                    if line_len > max_len {
767                        max_len = line_len;
768                    }
769                    linepos = 0;
770                    line_len = 0;
771                }
772                0x20..=0x7E => {
773                    // Printable ASCII
774                    linepos += 1;
775                    if linepos > line_len {
776                        line_len = linepos;
777                    }
778                }
779                _ => {
780                    // Non-printable ASCII control chars: width 0
781                }
782            }
783            i += 1;
784        } else {
785            // Multibyte UTF-8
786            let (cp, len) = decode_utf8(&data[i..]);
787
788            // C1 control characters (0x80..0x9F): non-printable
789            if cp <= 0x9F {
790                // width 0
791            } else if is_wide_char(cp) {
792                linepos += 2;
793                if linepos > line_len {
794                    line_len = linepos;
795                }
796            } else {
797                // Regular printable Unicode character: width 1
798                linepos += 1;
799                if linepos > line_len {
800                    line_len = linepos;
801                }
802            }
803            i += len;
804        }
805    }
806
807    // Handle last line
808    if line_len > max_len {
809        max_len = line_len;
810    }
811
812    max_len
813}
814
815/// Compute maximum display width, choosing behavior based on locale.
816#[inline]
817pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
818    if utf8 {
819        max_line_length_utf8(data)
820    } else {
821        max_line_length_c(data)
822    }
823}
824
825/// Count all metrics using optimized individual passes.
826///
827/// Each metric uses its own optimized algorithm:
828/// - Lines: SIMD-accelerated memchr
829/// - Words: branchless lookup table
830/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
831/// - Max line length: locale-aware display width tracking
832///
833/// Multi-pass is faster than single-pass because each pass has a tight,
834/// specialized loop. After the first pass, data is hot in L2/L3 cache,
835/// making subsequent passes nearly free for memory bandwidth.
836pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
837    WcCounts {
838        lines: count_lines(data),
839        words: count_words_locale(data, utf8),
840        bytes: data.len() as u64,
841        chars: count_chars(data, utf8),
842        max_line_length: max_line_length(data, utf8),
843    }
844}
845
846// ──────────────────────────────────────────────────
847// Parallel counting for large files
848// ──────────────────────────────────────────────────
849
850/// Count newlines in parallel using SIMD memchr + rayon.
851pub fn count_lines_parallel(data: &[u8]) -> u64 {
852    if data.len() < PARALLEL_THRESHOLD {
853        return count_lines(data);
854    }
855
856    let num_threads = rayon::current_num_threads().max(1);
857    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
858
859    data.par_chunks(chunk_size)
860        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
861        .sum()
862}
863
864/// Count words in parallel with boundary adjustment.
865pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
866    if utf8 || data.len() < PARALLEL_THRESHOLD {
867        // UTF-8 word counting uses a state machine that can't be trivially parallelized
868        // (multi-byte sequences may span chunk boundaries).
869        return count_words_locale(data, utf8);
870    }
871
872    let table = &WS_TABLE_C;
873    let num_threads = rayon::current_num_threads().max(1);
874    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
875
876    let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
877
878    let results: Vec<(u64, bool, bool)> = chunks
879        .par_iter()
880        .map(|chunk| {
881            let words = count_words_with_table(chunk, table);
882            let starts_non_ws = chunk.first().is_some_and(|&b| table[b as usize] == 0);
883            let ends_non_ws = chunk.last().is_some_and(|&b| table[b as usize] == 0);
884            (words, starts_non_ws, ends_non_ws)
885        })
886        .collect();
887
888    let mut total = 0u64;
889    for i in 0..results.len() {
890        total += results[i].0;
891        if i > 0 && results[i].1 && results[i - 1].2 {
892            total -= 1;
893        }
894    }
895    total
896}
897
898/// Count UTF-8 characters in parallel.
899pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
900    if !utf8 {
901        return data.len() as u64;
902    }
903    if data.len() < PARALLEL_THRESHOLD {
904        return count_chars_utf8(data);
905    }
906
907    let num_threads = rayon::current_num_threads().max(1);
908    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
909
910    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
911}
912
913/// Combined parallel counting of lines + words + chars.
914pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
915    if data.len() < PARALLEL_THRESHOLD {
916        let lines = count_lines(data);
917        let words = count_words_locale(data, utf8);
918        let chars = count_chars(data, utf8);
919        return (lines, words, chars);
920    }
921
922    // Word counting: sequential for UTF-8 (state machine), parallel for C locale
923    let words = count_words_parallel(data, utf8);
924
925    // Lines and chars can always be parallelized safely
926    let num_threads = rayon::current_num_threads().max(1);
927    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
928
929    let lines: u64 = data
930        .par_chunks(chunk_size)
931        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
932        .sum();
933
934    let chars = if utf8 {
935        data.par_chunks(chunk_size).map(count_chars_utf8).sum()
936    } else {
937        data.len() as u64
938    };
939
940    (lines, words, chars)
941}