coreutils_rs/wc/core.rs

use memchr::memchr_iter;
use rayon::prelude::*;

/// Minimum data size to use parallel processing (2MB).
/// Lower threshold lets us exploit 4 cores on smaller files.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Results from counting a byte slice.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    pub lines: u64,
    pub words: u64,
    pub bytes: u64,
    pub chars: u64,
    pub max_line_length: u64,
}

/// Whitespace lookup table for branchless word boundary detection.
/// GNU wc uses C locale `isspace()`: space, tab, newline, CR, form feed, vertical tab.
const fn make_ws_table() -> [u8; 256] {
    let mut t = [0u8; 256];
    t[0x09] = 1; // \t  horizontal tab
    t[0x0A] = 1; // \n  newline
    t[0x0B] = 1; // \v  vertical tab
    t[0x0C] = 1; // \f  form feed
    t[0x0D] = 1; // \r  carriage return
    t[0x20] = 1; //     space
    t
}

/// Precomputed whitespace lookup: `WS_TABLE[byte] == 1` if whitespace, `0` otherwise.
const WS_TABLE: [u8; 256] = make_ws_table();

/// Printable ASCII lookup table: 0x20 (space) through 0x7E (~) are printable.
const fn make_printable_table() -> [u8; 256] {
    let mut t = [0u8; 256];
    let mut i = 0x20u16;
    while i <= 0x7E {
        t[i as usize] = 1;
        i += 1;
    }
    t
}

const PRINTABLE_TABLE: [u8; 256] = make_printable_table();

/// Count newlines using SIMD-accelerated memchr.
/// GNU wc counts newline bytes (`\n`), not logical lines.
#[inline]
pub fn count_lines(data: &[u8]) -> u64 {
    memchr_iter(b'\n', data).count() as u64
}
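
// Example: `count_lines(b"one\ntwo\nthree")` returns 2, because the trailing
// "three" is not terminated by a newline byte (matching `wc -l` semantics).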

/// Count bytes. Trivial but included for API consistency.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    data.len() as u64
}

/// Count words using SIMD-accelerated whitespace detection + popcount.
///
/// A word is a maximal run of non-whitespace bytes (GNU wc definition).
/// Word starts are transitions from whitespace to non-whitespace.
///
/// On x86_64, uses SSE2 range comparisons to detect whitespace in 16-byte
/// vectors: `(0x09 <= b <= 0x0D) || (b == 0x20)`. Four vectors are processed
/// per iteration (64 bytes), with movemask combining into a 64-bit bitmask
/// for popcount-based word boundary detection.
///
/// Fallback: scalar 64-byte block bitmask approach with table lookup.
pub fn count_words(data: &[u8]) -> u64 {
    #[cfg(target_arch = "x86_64")]
    {
        // SSE2 is always available on x86_64
        return unsafe { count_words_sse2(data) };
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        return count_words_scalar(data);
    }
}
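
// Worked example of the word-start bitmask trick shared by the SSE2 and scalar
// paths below, for the 5-byte input "ab cd" (bit i corresponds to byte i):
//
//   ws_mask     = 0b00100                (only the space at index 2 is whitespace)
//   prev_mask   = (ws_mask << 1) | 1    = 0b01001  (the OR-ed 1 models the
//                                                   imaginary whitespace before byte 0)
//   word_starts = prev_mask & !ws_mask  = 0b01001
//
// popcount(word_starts) = 2, i.e. two words ("ab" and "cd").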

/// SSE2-accelerated word counting. Processes 64 bytes per iteration using
/// 4 XMM registers for whitespace detection, then combines into a 64-bit
/// bitmask for word boundary detection via popcount.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_words_sse2(data: &[u8]) -> u64 {
    use std::arch::x86_64::*;

    unsafe {
        // Whitespace = (0x09 <= b <= 0x0D) || (b == 0x20)
        // Using signed comparison: cmpgt(b, 0x08) && cmpgt(0x0E, b) || cmpeq(b, 0x20)
        let min_ws = _mm_set1_epi8(0x08); // one below \t
        let max_ws = _mm_set1_epi8(0x0E); // one above \r
        let space = _mm_set1_epi8(0x20);

        let mut words = 0u64;
        let mut prev_ws_bit = 1u64; // treat start-of-data as whitespace

        let chunks = data.chunks_exact(64);
        let remainder = chunks.remainder();

        for chunk in chunks {
            let ptr = chunk.as_ptr();

            // Load 4 x 16-byte vectors
            let v0 = _mm_loadu_si128(ptr as *const __m128i);
            let v1 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
            let v2 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
            let v3 = _mm_loadu_si128(ptr.add(48) as *const __m128i);

            // Detect whitespace in each vector: 3 comparisons + 1 AND + 1 OR
            macro_rules! detect_ws {
                ($v:expr) => {{
                    let ge_9 = _mm_cmpgt_epi8($v, min_ws);
                    let le_d = _mm_cmpgt_epi8(max_ws, $v);
                    let in_range = _mm_and_si128(ge_9, le_d);
                    let is_sp = _mm_cmpeq_epi8($v, space);
                    _mm_or_si128(in_range, is_sp)
                }};
            }

            let ws0 = detect_ws!(v0);
            let ws1 = detect_ws!(v1);
            let ws2 = detect_ws!(v2);
            let ws3 = detect_ws!(v3);

            // Combine 4 x 16-bit movemasks into one 64-bit whitespace mask
            let m0 = (_mm_movemask_epi8(ws0) as u16) as u64;
            let m1 = (_mm_movemask_epi8(ws1) as u16) as u64;
            let m2 = (_mm_movemask_epi8(ws2) as u16) as u64;
            let m3 = (_mm_movemask_epi8(ws3) as u16) as u64;
            let ws_mask = m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);

            // Word starts: where previous byte was whitespace AND current is NOT
            let prev_mask = (ws_mask << 1) | prev_ws_bit;
            let word_starts = prev_mask & !ws_mask;
            words += word_starts.count_ones() as u64;

            prev_ws_bit = (ws_mask >> 63) & 1;
        }

        // Handle 16-byte sub-chunks of remainder
        let sub_chunks = remainder.chunks_exact(16);
        let sub_remainder = sub_chunks.remainder();
        let mut prev_ws_u32 = prev_ws_bit as u32;

        for chunk in sub_chunks {
            let v = _mm_loadu_si128(chunk.as_ptr() as *const __m128i);
            let ge_9 = _mm_cmpgt_epi8(v, min_ws);
            let le_d = _mm_cmpgt_epi8(max_ws, v);
            let in_range = _mm_and_si128(ge_9, le_d);
            let is_sp = _mm_cmpeq_epi8(v, space);
            let ws_vec = _mm_or_si128(in_range, is_sp);
            let ws_mask = _mm_movemask_epi8(ws_vec) as u32;

            let prev_mask = (ws_mask << 1) | prev_ws_u32;
            let word_starts = prev_mask & (!ws_mask & 0xFFFF);
            words += word_starts.count_ones() as u64;
            prev_ws_u32 = (ws_mask >> 15) & 1;
        }

        // Scalar for final <16 bytes
        let mut prev_ws = prev_ws_u32 as u8;
        for &b in sub_remainder {
            let curr_ws = WS_TABLE[b as usize];
            words += (prev_ws & (curr_ws ^ 1)) as u64;
            prev_ws = curr_ws;
        }
        words
    }
}

/// Scalar word counting fallback for targets other than x86_64.
/// Uses 64-byte block bitmask operations with table lookup + popcount.
#[cfg(not(target_arch = "x86_64"))]
fn count_words_scalar(data: &[u8]) -> u64 {
    let mut words = 0u64;
    let mut prev_ws_bit = 1u64;

    let chunks = data.chunks_exact(64);
    let remainder = chunks.remainder();

    for chunk in chunks {
        let mut ws_mask = 0u64;
        let mut i = 0;
        while i + 7 < 64 {
            ws_mask |= (WS_TABLE[chunk[i] as usize] as u64) << i;
            ws_mask |= (WS_TABLE[chunk[i + 1] as usize] as u64) << (i + 1);
            ws_mask |= (WS_TABLE[chunk[i + 2] as usize] as u64) << (i + 2);
            ws_mask |= (WS_TABLE[chunk[i + 3] as usize] as u64) << (i + 3);
            ws_mask |= (WS_TABLE[chunk[i + 4] as usize] as u64) << (i + 4);
            ws_mask |= (WS_TABLE[chunk[i + 5] as usize] as u64) << (i + 5);
            ws_mask |= (WS_TABLE[chunk[i + 6] as usize] as u64) << (i + 6);
            ws_mask |= (WS_TABLE[chunk[i + 7] as usize] as u64) << (i + 7);
            i += 8;
        }

        let prev_mask = (ws_mask << 1) | prev_ws_bit;
        let word_starts = prev_mask & !ws_mask;
        words += word_starts.count_ones() as u64;
        prev_ws_bit = (ws_mask >> 63) & 1;
    }

    let mut prev_ws = prev_ws_bit as u8;
    for &b in remainder {
        let curr_ws = WS_TABLE[b as usize];
        words += (prev_ws & (curr_ws ^ 1)) as u64;
        prev_ws = curr_ws;
    }
    words
}

/// Count lines and words in a single pass using 64-byte bitmask blocks.
///
/// Eliminates the separate memchr line-counting pass by piggybacking newline
/// counting onto the whitespace bitmask already computed for word counting.
/// For each 64-byte block, we build both a whitespace mask and a newline mask,
/// then use popcount on each.
pub fn count_lines_words(data: &[u8]) -> (u64, u64) {
    let mut words = 0u64;
    let mut lines = 0u64;
    let mut prev_ws_bit = 1u64;

    let chunks = data.chunks_exact(64);
    let remainder = chunks.remainder();

    for chunk in chunks {
        let mut ws_mask = 0u64;
        let mut nl_mask = 0u64;
        let mut i = 0;
        while i + 7 < 64 {
            let b0 = chunk[i];
            let b1 = chunk[i + 1];
            let b2 = chunk[i + 2];
            let b3 = chunk[i + 3];
            let b4 = chunk[i + 4];
            let b5 = chunk[i + 5];
            let b6 = chunk[i + 6];
            let b7 = chunk[i + 7];
            ws_mask |= (WS_TABLE[b0 as usize] as u64) << i;
            ws_mask |= (WS_TABLE[b1 as usize] as u64) << (i + 1);
            ws_mask |= (WS_TABLE[b2 as usize] as u64) << (i + 2);
            ws_mask |= (WS_TABLE[b3 as usize] as u64) << (i + 3);
            ws_mask |= (WS_TABLE[b4 as usize] as u64) << (i + 4);
            ws_mask |= (WS_TABLE[b5 as usize] as u64) << (i + 5);
            ws_mask |= (WS_TABLE[b6 as usize] as u64) << (i + 6);
            ws_mask |= (WS_TABLE[b7 as usize] as u64) << (i + 7);
            nl_mask |= ((b0 == b'\n') as u64) << i;
            nl_mask |= ((b1 == b'\n') as u64) << (i + 1);
            nl_mask |= ((b2 == b'\n') as u64) << (i + 2);
            nl_mask |= ((b3 == b'\n') as u64) << (i + 3);
            nl_mask |= ((b4 == b'\n') as u64) << (i + 4);
            nl_mask |= ((b5 == b'\n') as u64) << (i + 5);
            nl_mask |= ((b6 == b'\n') as u64) << (i + 6);
            nl_mask |= ((b7 == b'\n') as u64) << (i + 7);
            i += 8;
        }

        let prev_mask = (ws_mask << 1) | prev_ws_bit;
        let word_starts = prev_mask & !ws_mask;
        words += word_starts.count_ones() as u64;
        lines += nl_mask.count_ones() as u64;
        prev_ws_bit = (ws_mask >> 63) & 1;
    }

    let mut prev_ws = prev_ws_bit as u8;
    for &b in remainder {
        if b == b'\n' {
            lines += 1;
        }
        let curr_ws = WS_TABLE[b as usize];
        words += (prev_ws & (curr_ws ^ 1)) as u64;
        prev_ws = curr_ws;
    }
    (lines, words)
}
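
// Example: for b"alpha beta\ngamma\n" the fused pass returns (2, 3): two newline
// bytes and three words, the same results as calling count_lines and count_words
// separately, but touching the data only once.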

/// Count lines, words, and UTF-8 chars in a single pass using 64-byte bitmask blocks.
///
/// Builds whitespace, newline, and non-continuation-byte masks simultaneously
/// in one read of the data, then uses popcount on each mask. This is the most
/// memory-efficient approach when all three metrics are needed, as it touches
/// each cache line only once.
pub fn count_lines_words_chars_utf8(data: &[u8]) -> (u64, u64, u64) {
    let mut words = 0u64;
    let mut lines = 0u64;
    let mut chars = 0u64;
    let mut prev_ws_bit = 1u64;

    let chunks = data.chunks_exact(64);
    let remainder = chunks.remainder();

    for chunk in chunks {
        let mut ws_mask = 0u64;
        let mut nl_mask = 0u64;
        let mut char_mask = 0u64;
        let mut i = 0;
        while i + 7 < 64 {
            let b0 = chunk[i];
            let b1 = chunk[i + 1];
            let b2 = chunk[i + 2];
            let b3 = chunk[i + 3];
            let b4 = chunk[i + 4];
            let b5 = chunk[i + 5];
            let b6 = chunk[i + 6];
            let b7 = chunk[i + 7];

            ws_mask |= (WS_TABLE[b0 as usize] as u64) << i
                | (WS_TABLE[b1 as usize] as u64) << (i + 1)
                | (WS_TABLE[b2 as usize] as u64) << (i + 2)
                | (WS_TABLE[b3 as usize] as u64) << (i + 3)
                | (WS_TABLE[b4 as usize] as u64) << (i + 4)
                | (WS_TABLE[b5 as usize] as u64) << (i + 5)
                | (WS_TABLE[b6 as usize] as u64) << (i + 6)
                | (WS_TABLE[b7 as usize] as u64) << (i + 7);

            nl_mask |= ((b0 == b'\n') as u64) << i
                | ((b1 == b'\n') as u64) << (i + 1)
                | ((b2 == b'\n') as u64) << (i + 2)
                | ((b3 == b'\n') as u64) << (i + 3)
                | ((b4 == b'\n') as u64) << (i + 4)
                | ((b5 == b'\n') as u64) << (i + 5)
                | ((b6 == b'\n') as u64) << (i + 6)
                | ((b7 == b'\n') as u64) << (i + 7);

            char_mask |= (((b0 & 0xC0) != 0x80) as u64) << i
                | (((b1 & 0xC0) != 0x80) as u64) << (i + 1)
                | (((b2 & 0xC0) != 0x80) as u64) << (i + 2)
                | (((b3 & 0xC0) != 0x80) as u64) << (i + 3)
                | (((b4 & 0xC0) != 0x80) as u64) << (i + 4)
                | (((b5 & 0xC0) != 0x80) as u64) << (i + 5)
                | (((b6 & 0xC0) != 0x80) as u64) << (i + 6)
                | (((b7 & 0xC0) != 0x80) as u64) << (i + 7);

            i += 8;
        }
        let prev_mask = (ws_mask << 1) | prev_ws_bit;
        let word_starts = prev_mask & !ws_mask;
        words += word_starts.count_ones() as u64;
        lines += nl_mask.count_ones() as u64;
        chars += char_mask.count_ones() as u64;
        prev_ws_bit = (ws_mask >> 63) & 1;
    }

    let mut prev_ws = prev_ws_bit as u8;
    for &b in remainder {
        if b == b'\n' {
            lines += 1;
        }
        let curr_ws = WS_TABLE[b as usize];
        words += (prev_ws & (curr_ws ^ 1)) as u64;
        prev_ws = curr_ws;
        chars += ((b & 0xC0) != 0x80) as u64;
    }
    (lines, words, chars)
}

/// Count UTF-8 characters by counting non-continuation bytes.
/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
///
/// Uses 64-byte block processing with popcount for ~4x throughput vs scalar.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    let mut count = 0u64;
    let chunks = data.chunks_exact(64);
    let remainder = chunks.remainder();

    for chunk in chunks {
        // Build 64-bit mask: bit i = 1 if chunk[i] is NOT a continuation byte
        let mut char_mask = 0u64;
        let mut i = 0;
        while i + 7 < 64 {
            char_mask |= (((chunk[i] & 0xC0) != 0x80) as u64) << i;
            char_mask |= (((chunk[i + 1] & 0xC0) != 0x80) as u64) << (i + 1);
            char_mask |= (((chunk[i + 2] & 0xC0) != 0x80) as u64) << (i + 2);
            char_mask |= (((chunk[i + 3] & 0xC0) != 0x80) as u64) << (i + 3);
            char_mask |= (((chunk[i + 4] & 0xC0) != 0x80) as u64) << (i + 4);
            char_mask |= (((chunk[i + 5] & 0xC0) != 0x80) as u64) << (i + 5);
            char_mask |= (((chunk[i + 6] & 0xC0) != 0x80) as u64) << (i + 6);
            char_mask |= (((chunk[i + 7] & 0xC0) != 0x80) as u64) << (i + 7);
            i += 8;
        }
        count += char_mask.count_ones() as u64;
    }

    for &b in remainder {
        count += ((b & 0xC0) != 0x80) as u64;
    }
    count
}
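
// Example: "héllo" is 6 bytes (0x68, 0xC3, 0xA9, 0x6C, 0x6C, 0x6F). Only 0xA9 has
// the 10xxxxxx continuation pattern (0xA9 & 0xC0 == 0x80), so 5 bytes start a
// character and `count_chars_utf8` returns 5.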

/// Count characters in C/POSIX locale (each byte is one character).
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    data.len() as u64
}

/// Count characters, choosing behavior based on locale.
#[inline]
pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
    if utf8 {
        count_chars_utf8(data)
    } else {
        count_chars_c(data)
    }
}

/// Detect if the current locale uses UTF-8 encoding.
pub fn is_utf8_locale() -> bool {
    for var in &["LC_ALL", "LC_CTYPE", "LANG"] {
        if let Ok(val) = std::env::var(var) {
            if !val.is_empty() {
                let lower = val.to_ascii_lowercase();
                return lower.contains("utf-8") || lower.contains("utf8");
            }
        }
    }
    false
}

/// Decode one UTF-8 character from a byte slice.
/// Returns (codepoint, byte_length). On invalid UTF-8, returns (byte as u32, 1).
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let b0 = bytes[0];
    if b0 < 0x80 {
        return (b0 as u32, 1);
    }
    if b0 < 0xC2 {
        // Continuation byte or overlong 2-byte — invalid as start
        return (b0 as u32, 1);
    }
    if b0 < 0xE0 {
        if bytes.len() < 2 || bytes[1] & 0xC0 != 0x80 {
            return (b0 as u32, 1);
        }
        let cp = ((b0 as u32 & 0x1F) << 6) | (bytes[1] as u32 & 0x3F);
        return (cp, 2);
    }
    if b0 < 0xF0 {
        if bytes.len() < 3 || bytes[1] & 0xC0 != 0x80 || bytes[2] & 0xC0 != 0x80 {
            return (b0 as u32, 1);
        }
        let cp =
            ((b0 as u32 & 0x0F) << 12) | ((bytes[1] as u32 & 0x3F) << 6) | (bytes[2] as u32 & 0x3F);
        return (cp, 3);
    }
    if b0 < 0xF5 {
        if bytes.len() < 4
            || bytes[1] & 0xC0 != 0x80
            || bytes[2] & 0xC0 != 0x80
            || bytes[3] & 0xC0 != 0x80
        {
            return (b0 as u32, 1);
        }
        let cp = ((b0 as u32 & 0x07) << 18)
            | ((bytes[1] as u32 & 0x3F) << 12)
            | ((bytes[2] as u32 & 0x3F) << 6)
            | (bytes[3] as u32 & 0x3F);
        return (cp, 4);
    }
    (b0 as u32, 1)
}
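
// Example: decoding the Euro sign '€' (U+20AC, bytes 0xE2 0x82 0xAC):
//   b0 = 0xE2 < 0xF0 -> 3-byte sequence
//   cp = ((0xE2 & 0x0F) << 12) | ((0x82 & 0x3F) << 6) | (0xAC & 0x3F)
//      =  0x2000              |  0x0080              |  0x002C       = 0x20AC
// so `decode_utf8(b"\xE2\x82\xAC")` returns (0x20AC, 3).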

/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(cp,
        0x1100..=0x115F   // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3041..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3  // Hangul Syllables
        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19  // Vertical Forms
        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
        | 0xFF01..=0xFF60  // Fullwidth Forms (fullwidth ASCII variants and punctuation)
        | 0xFFE0..=0xFFE6  // Fullwidth Signs
        | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}

/// Compute maximum display width of any line (C/POSIX locale).
///
/// GNU wc -L behavior in C locale:
/// - `\n`: line terminator (records max, resets position)
/// - `\t`: advances to next tab stop (multiple of 8)
/// - `\r`: carriage return (resets position to 0, same line)
/// - `\f`: form feed (acts as line terminator like \n)
/// - Printable ASCII (0x20..0x7E): width 1
/// - Everything else (controls, high bytes): width 0
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0;
    let mut line_len: u64 = 0; // max position seen on current line
    let mut linepos: u64 = 0; // current cursor position

    for &b in data {
        match b {
            b'\n' => {
                if line_len > max_len {
                    max_len = line_len;
                }
                linepos = 0;
                line_len = 0;
            }
            b'\t' => {
                linepos = (linepos + 8) & !7;
                if linepos > line_len {
                    line_len = linepos;
                }
            }
            b'\r' => {
                linepos = 0;
            }
            0x0C => {
                // Form feed: acts as line terminator
                if line_len > max_len {
                    max_len = line_len;
                }
                linepos = 0;
                line_len = 0;
            }
            _ => {
                if PRINTABLE_TABLE[b as usize] != 0 {
                    linepos += 1;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                // Non-printable: width 0
            }
        }
    }

    // Handle last line (may not end with \n)
    if line_len > max_len {
        max_len = line_len;
    }

    max_len
}
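
// Example of the tab-stop arithmetic above: with `linepos = 3`, a '\t' advances to
// (3 + 8) & !7 = 8 (the next multiple of 8); with `linepos = 8` it advances to 16.
// So `max_line_length_c(b"ab\tcd")` reports 10: "ab" ends at column 2, the tab
// jumps to column 8, and "cd" ends at column 10.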

/// Compute maximum display width of any line (UTF-8 locale).
///
/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
pub fn max_line_length_utf8(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0;
    let mut line_len: u64 = 0;
    let mut linepos: u64 = 0;
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        // Fast path for common ASCII
        if b < 0x80 {
            match b {
                b'\n' => {
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                b'\t' => {
                    linepos = (linepos + 8) & !7;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                b'\r' => {
                    linepos = 0;
                }
                0x0C => {
                    // Form feed: line terminator
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                0x20..=0x7E => {
                    // Printable ASCII
                    linepos += 1;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                _ => {
                    // Non-printable ASCII control chars: width 0
                }
            }
            i += 1;
        } else {
            // Multibyte UTF-8
            let (cp, len) = decode_utf8(&data[i..]);

            // C1 control characters (0x80..0x9F): non-printable
            if cp <= 0x9F {
                // width 0
            } else if is_wide_char(cp) {
                linepos += 2;
                if linepos > line_len {
                    line_len = linepos;
                }
            } else {
                // Regular printable Unicode character: width 1
                linepos += 1;
                if linepos > line_len {
                    line_len = linepos;
                }
            }
            i += len;
        }
    }

    // Handle last line
    if line_len > max_len {
        max_len = line_len;
    }

    max_len
}

/// Compute maximum display width, choosing behavior based on locale.
#[inline]
pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
    if utf8 {
        max_line_length_utf8(data)
    } else {
        max_line_length_c(data)
    }
}

/// Count all metrics using optimized individual passes.
///
/// Each metric uses its own optimized algorithm:
/// - Lines: SIMD-accelerated memchr
/// - Words: SSE2 bitmask counting (x86_64) or the branchless lookup-table fallback
/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
/// - Max line length: locale-aware display width tracking
///
/// Multiple specialized passes beat one combined pass here because each pass is a
/// tight, dedicated loop. After the first pass the data is hot in L2/L3 cache, so
/// the later passes add little extra memory-bandwidth cost.
pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
    WcCounts {
        lines: count_lines(data),
        words: count_words(data),
        bytes: data.len() as u64,
        chars: count_chars(data, utf8),
        max_line_length: max_line_length(data, utf8),
    }
}

// ──────────────────────────────────────────────────
// Parallel counting for large files
// ──────────────────────────────────────────────────

/// Count newlines in parallel using SIMD memchr + rayon.
pub fn count_lines_parallel(data: &[u8]) -> u64 {
    if data.len() < PARALLEL_THRESHOLD {
        return count_lines(data);
    }

    let num_threads = rayon::current_num_threads().max(1);
    let chunk_size = (data.len() / num_threads).max(1024 * 1024);

    data.par_chunks(chunk_size)
        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
        .sum()
}

/// Count words in parallel with boundary adjustment.
///
/// Each chunk is counted independently, then boundaries are checked:
/// if chunk N ends with non-whitespace and chunk N+1 starts with non-whitespace,
/// a word was split across the boundary and double-counted — subtract 1.
pub fn count_words_parallel(data: &[u8]) -> u64 {
    if data.len() < PARALLEL_THRESHOLD {
        return count_words(data);
    }

    let num_threads = rayon::current_num_threads().max(1);
    let chunk_size = (data.len() / num_threads).max(1024 * 1024);

    let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();

    // Each chunk produces: (word_count, first_byte_is_non_ws, last_byte_is_non_ws)
    let results: Vec<(u64, bool, bool)> = chunks
        .par_iter()
        .map(|chunk| {
            let words = count_words(chunk);
            let starts_non_ws = chunk.first().is_some_and(|&b| WS_TABLE[b as usize] == 0);
            let ends_non_ws = chunk.last().is_some_and(|&b| WS_TABLE[b as usize] == 0);
            (words, starts_non_ws, ends_non_ws)
        })
        .collect();

    let mut total = 0u64;
    for i in 0..results.len() {
        total += results[i].0;
        // If previous chunk ends with non-ws and this chunk starts with non-ws,
        // a word spans the boundary and was counted as two separate words.
        if i > 0 && results[i].1 && results[i - 1].2 {
            total -= 1;
        }
    }
    total
}
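
// Example of the boundary fix-up: splitting b"foo bar" after byte 5 gives chunks
// b"foo b" and b"ar", which count 2 + 1 = 3 words independently. The first chunk
// ends with non-whitespace ('b') and the second starts with non-whitespace ('a'),
// so one split word was double-counted and the total is corrected to 2.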

/// Count UTF-8 characters in parallel.
pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
    if !utf8 {
        return data.len() as u64;
    }
    if data.len() < PARALLEL_THRESHOLD {
        return count_chars_utf8(data);
    }

    let num_threads = rayon::current_num_threads().max(1);
    let chunk_size = (data.len() / num_threads).max(1024 * 1024);

    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
}

/// Partial result from processing a chunk, used to combine results at boundaries.
struct ChunkResult {
    lines: u64,
    words: u64,
    chars: u64,
    starts_non_ws: bool,
    ends_non_ws: bool,
}

/// Combined parallel counting of lines + words + chars.
///
/// Each chunk uses SIMD-accelerated memchr for line counting plus the optimized
/// bitmask word counting, with the data staying cache-warm between passes.
/// On compute-bound CPUs (like Atom), this multi-pass approach is faster than a
/// combined scalar single pass: memchr processes 16+ bytes per cycle with SIMD,
/// while a fused loop adds non-SIMD overhead to every byte.
pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
    if data.len() < PARALLEL_THRESHOLD {
        // Small files: SIMD memchr + scalar word counting, sequential
        let lines = count_lines(data);
        let words = count_words(data);
        let chars = count_chars(data, utf8);
        return (lines, words, chars);
    }

    let num_threads = rayon::current_num_threads().max(1);
    let chunk_size = (data.len() / num_threads).max(1024 * 1024);

    let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();

    let results: Vec<ChunkResult> = chunks
        .par_iter()
        .map(|chunk| {
            // Pass 1: SIMD-accelerated line counting (loads data into cache)
            let lines = memchr_iter(b'\n', chunk).count() as u64;
            // Pass 2: Bitmask word counting (data now cache-warm from pass 1)
            let words = count_words(chunk);
            // Pass 3: Char counting (fast with warm cache, or free for non-UTF8)
            let chars = if utf8 {
                count_chars_utf8(chunk)
            } else {
                chunk.len() as u64
            };
            let starts_non_ws = chunk.first().is_some_and(|&b| WS_TABLE[b as usize] == 0);
            let ends_non_ws = chunk.last().is_some_and(|&b| WS_TABLE[b as usize] == 0);
            ChunkResult {
                lines,
                words,
                chars,
                starts_non_ws,
                ends_non_ws,
            }
        })
        .collect();

    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut total_chars = 0u64;
    for i in 0..results.len() {
        total_lines += results[i].lines;
        total_words += results[i].words;
        total_chars += results[i].chars;
        if i > 0 && results[i].starts_non_ws && results[i - 1].ends_non_ws {
            total_words -= 1;
        }
    }
    (total_lines, total_words, total_chars)
}
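
// A minimal test sketch exercising the counters above; the inputs and expected
// values are chosen by hand for illustration, not taken from any existing test suite.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sequential_counts_follow_wc_semantics() {
        let data: &[u8] = b"hello world\n\tfoo  bar baz\nno trailing newline";
        assert_eq!(count_lines(data), 2); // newline bytes, not logical lines
        assert_eq!(count_words(data), 8);
        assert_eq!(count_bytes(data), data.len() as u64);
        assert_eq!(count_chars_utf8(data), data.len() as u64); // pure ASCII
    }

    #[test]
    fn fused_passes_match_separate_passes() {
        let data: &[u8] = b"alpha beta\n  gamma\tdelta\r\nepsilon";
        assert_eq!(count_lines_words(data), (count_lines(data), count_words(data)));
        let (lines, words, chars) = count_lines_words_chars_utf8(data);
        assert_eq!(lines, count_lines(data));
        assert_eq!(words, count_words(data));
        assert_eq!(chars, count_chars_utf8(data));
    }

    #[test]
    fn parallel_paths_agree_with_sequential() {
        // Repeat a small pattern until we exceed PARALLEL_THRESHOLD so the
        // rayon code path actually runs.
        let data: Vec<u8> = b"lorem ipsum dolor\nsit amet "
            .iter()
            .copied()
            .cycle()
            .take(PARALLEL_THRESHOLD + 4096)
            .collect();
        assert_eq!(count_lines_parallel(&data), count_lines(&data));
        assert_eq!(count_words_parallel(&data), count_words(&data));
        assert_eq!(count_chars_parallel(&data, true), count_chars_utf8(&data));
        let (l, w, c) = count_lwc_parallel(&data, true);
        assert_eq!((l, w, c), (count_lines(&data), count_words(&data), count_chars_utf8(&data)));
    }
}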