Skip to main content

coreutils_rs/wc/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Smallest input, in bytes, for which parallel processing is used: 1 MiB.
///
/// Rationale recorded by the original author: Rayon costs roughly 5-10μs per
/// task, while a 1 MiB chunk scanned with SIMD memchr (~10 GB/s) takes about
/// 100μs, so the scheduling overhead stays below 10%.
const PARALLEL_THRESHOLD: usize = 1 << 20;
8
/// Results from counting a byte slice.
///
/// All counters default to zero (`Default` is derived), and two values compare
/// equal only when every counter matches.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Line count. The counting functions in this file count `\n` bytes.
    pub lines: u64,
    /// Word count.
    pub words: u64,
    /// Byte count.
    pub bytes: u64,
    /// Character count.
    pub chars: u64,
    /// Length of the longest line.
    /// NOTE(review): the unit (bytes vs. display columns) is not established in
    /// this part of the file — confirm against the code that fills it in.
    pub max_line_length: u64,
}
18
19// ──────────────────────────────────────────────────
20// Byte classification for word counting
21// ──────────────────────────────────────────────────
22//
23// C locale (GNU wc 9.4): 3-state model using isprint() as gatekeeper.
24//   0 = transparent: non-printable, non-space bytes (NUL, controls, DEL, 0x80-0xFF)
25//   1 = word-break: whitespace (0x09-0x0D, 0x20)
26//   2 = word content: printable ASCII (0x21-0x7E)
27// Transparent bytes don't start, continue, or break words.
28//
29// UTF-8 locale: 3-state model (matching GNU wc 9.4 mbrtowc behavior).
30// ASCII uses same BYTE_CLASS_C table (controls transparent, spaces break, printable content).
31// Multi-byte Unicode spaces are detected via codepoint lookup.
32// Invalid UTF-8 encoding errors are transparent (GNU mbrtowc skips on error).
33
/// Builds the 256-entry lookup table used for C/POSIX-locale word counting.
///
/// Every byte value is assigned one of three classes:
///   0 = transparent: anything that is neither whitespace nor printable ASCII
///       (NUL, other control bytes, DEL, 0x80-0xFF)
///   1 = word-break: tab, newline, vertical tab, form feed, carriage return
///       (0x09-0x0D) and space (0x20)
///   2 = word content: printable ASCII excluding space (0x21-0x7E)
///
/// Per the original author this mirrors GNU wc 9.4, where only printable bytes
/// can start or continue a word and transparent bytes neither start, continue
/// nor break one.
const fn make_byte_class_c() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut byte = 0usize;
    // `for` loops are not available in `const fn`, hence the manual counter.
    while byte < 256 {
        table[byte] = match byte {
            0x09..=0x0D | 0x20 => 1, // whitespace → word-break
            0x21..=0x7E => 2,        // printable ASCII → word content
            _ => 0,                  // everything else → transparent
        };
        byte += 1;
    }
    table
}
const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
59
60/// For parallel chunk merging: determine if a chunk effectively starts with
61/// word content by scanning past leading transparent bytes. This is needed
62/// because transparent bytes don't break words, so a chunk starting with
63/// [\x00, \x80, 'a'] should be treated as starting with word content.
64#[inline]
65pub(crate) fn first_is_word_c(data: &[u8]) -> bool {
66    for &b in data {
67        let class = BYTE_CLASS_C[b as usize];
68        if class != 0 {
69            return class == 2;
70        }
71    }
72    false // all transparent
73}
74
75// ──────────────────────────────────────────────────
76// Unicode character classification helpers
77// ──────────────────────────────────────────────────
78
/// Returns `true` when `cp` is one of the multi-byte Unicode whitespace code
/// points that end a word.
///
/// ASCII whitespace is deliberately not covered here; it is handled by the byte
/// classification table. The original author describes this set as matching
/// glibc `iswspace`.
/// NOTE(review): U+00A0 and U+202F are no-break spaces — confirm that including
/// them is the intended match for the reference implementation.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    // Code points outside the contiguous U+2000..=U+200A block.
    const ISOLATED_SPACES: [u32; 7] = [
        0x00A0, // No-Break Space
        0x1680, // Ogham Space Mark
        0x2028, // Line Separator
        0x2029, // Paragraph Separator
        0x202F, // Narrow No-Break Space
        0x205F, // Medium Mathematical Space
        0x3000, // Ideographic Space
    ];

    // En Quad through Hair Space form one contiguous range.
    (0x2000..=0x200A).contains(&cp) || ISOLATED_SPACES.contains(&cp)
}
96
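// NOTE(review): the two lines below were written as a `///` doc comment, but no
// item of their own follows them in this part of the file, so rustdoc would
// attach them to `count_lines`. They are kept as plain comments until the
// printable-codepoint helper they describe is located or restored:
//   Check if a Unicode codepoint (>= 0x80) is printable (matching glibc iswprint).
//   C1 control characters (U+0080-U+009F) are not printable.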
99// ──────────────────────────────────────────────────
100// Core counting functions
101// ──────────────────────────────────────────────────
102
103/// Count newlines using SIMD-accelerated memchr.
104/// GNU wc counts newline bytes (`\n`), not logical lines.
105#[inline]
106pub fn count_lines(data: &[u8]) -> u64 {
107    memchr_iter(b'\n', data).count() as u64
108}
109
/// Returns the length of `data` in bytes.
///
/// Trivial, but provided so that every counter has the same call shape.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let byte_len = data.len();
    byte_len as u64
}
115
116/// Count words using locale-aware 3-state logic (default: UTF-8).
117pub fn count_words(data: &[u8]) -> u64 {
118    count_words_locale(data, true)
119}
120
121/// Count words with explicit locale control using 2-state logic.
122///
123/// GNU wc classifies each byte/character as:
124///   - space (whitespace): sets in_word=false
125///   - word content (everything else): sets in_word=true, increments word count on transition
126pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
127    if utf8 {
128        count_words_utf8(data)
129    } else {
130        count_words_c(data)
131    }
132}
133
134/// Count words in C/POSIX locale using 3-state logic matching GNU wc 9.4.
135/// Bytes are classified into 3 categories:
136///   - Word-break (class 1): 0x09-0x0D, 0x20 — ends current word
137///   - Word content (class 2): 0x21-0x7E — starts or continues a word
138///   - Transparent (class 0): everything else — no effect on word state
139fn count_words_c(data: &[u8]) -> u64 {
140    let mut words = 0u64;
141    let mut in_word = false;
142    let mut i = 0;
143    let len = data.len();
144
145    while i < len {
146        let b = unsafe { *data.get_unchecked(i) };
147        let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
148        if class == 1 {
149            // Word-break (whitespace) — end current word
150            in_word = false;
151        } else if class == 2 {
152            // Word content (printable) — start or continue word
153            if !in_word {
154                in_word = true;
155                words += 1;
156            }
157        }
158        // class == 0: transparent — no state change
159        i += 1;
160    }
161    words
162}
163
164/// Scalar tail for SIMD line+word counters: processes remaining bytes after
165/// the SIMD loop and returns final counts with boundary info.
166/// SAFETY: caller must ensure ptr is valid for [0..len) and i <= len.
167#[cfg(target_arch = "x86_64")]
168#[inline(always)]
169fn count_lw_c_scalar_tail(
170    ptr: *const u8,
171    mut i: usize,
172    len: usize,
173    mut total_lines: u64,
174    mut total_words: u64,
175    mut prev_in_word: bool,
176    data: &[u8],
177) -> (u64, u64, bool, bool) {
178    while i < len {
179        let b = unsafe { *ptr.add(i) };
180        let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
181        if class == 1 {
182            // Word-break
183            if b == b'\n' {
184                total_lines += 1;
185            }
186            prev_in_word = false;
187        } else if class == 2 && !prev_in_word {
188            // Word content (printable) — transition into word
189            total_words += 1;
190            prev_in_word = true;
191        }
192        // class == 0: transparent — no state change
193        i += 1;
194    }
195    let first_is_word = first_is_word_c(data);
196    (total_lines, total_words, first_is_word, prev_in_word)
197}
198
199/// AVX2-accelerated fused line+word counter for C locale chunks.
200/// Processes 32 bytes per iteration using 3-state logic matching GNU wc 9.4:
201///   - Word-break: {0x09-0x0D, 0x20} (6 bytes) — ends word
202///   - Word content: {0x21-0x7E} (printable ASCII) — starts/continues word
203///   - Transparent: everything else — no effect on word state
204/// Word transitions detected via bitmask on printable bytes only.
205#[cfg(target_arch = "x86_64")]
206#[target_feature(enable = "avx2")]
207unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
208    use std::arch::x86_64::*;
209
210    let len = data.len();
211    let ptr = data.as_ptr();
212    let mut i = 0usize;
213    let mut total_lines = 0u64;
214    let mut total_words = 0u64;
215    let mut prev_in_word = false;
216
217    unsafe {
218        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
219        let zero = _mm256_setzero_si256();
220        let ones = _mm256_set1_epi8(1);
221        // Printable range detection: 0x21-0x7E
222        let const_0x21 = _mm256_set1_epi8(0x21u8 as i8);
223        let const_0x7e = _mm256_set1_epi8(0x7Eu8 as i8);
224        // Word-break detection: {0x09-0x0D, 0x20}
225        let const_0x09 = _mm256_set1_epi8(0x09u8 as i8);
226        let const_0x0d = _mm256_set1_epi8(0x0Du8 as i8);
227        let const_0x20 = _mm256_set1_epi8(0x20u8 as i8);
228
229        let mut line_acc = _mm256_setzero_si256();
230        let mut batch = 0u32;
231
232        while i + 32 <= len {
233            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
234            let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
235            line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));
236
237            // Printable check: byte in [0x21, 0x7E] (word content)
238            let ge_21 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x21), v);
239            let le_7e = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x7e), v);
240            let is_printable = _mm256_and_si256(ge_21, le_7e);
241            let word_mask = _mm256_movemask_epi8(is_printable) as u32;
242
243            // Word-break check: byte in {0x09-0x0D, 0x20}
244            let ge_09 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x09), v);
245            let le_0d = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x0d), v);
246            let in_tab_range = _mm256_and_si256(ge_09, le_0d);
247            let is_space = _mm256_cmpeq_epi8(v, const_0x20);
248            let is_break = _mm256_or_si256(in_tab_range, is_space);
249            let break_mask = _mm256_movemask_epi8(is_break) as u32;
250
251            // 3-state segmented carry-propagation: compute word starts in O(1).
252            // Uses a segmented parallel prefix scan so carry cannot bridge across
253            // break positions. At each doubling step, the pass mask narrows to
254            // require ALL intermediate positions to be transparent.
255            let transparent = !break_mask & !word_mask;
256            let mut carry = word_mask | if prev_in_word { 1u32 } else { 0u32 };
257            carry &= !break_mask;
258            let mut pass = transparent;
259            carry |= (carry << 1) & pass;
260            pass &= pass << 1;
261            carry |= (carry << 2) & pass;
262            pass &= pass << 2;
263            carry |= (carry << 4) & pass;
264            pass &= pass << 4;
265            carry |= (carry << 8) & pass;
266            pass &= pass << 8;
267            carry |= (carry << 16) & pass;
268            let prev_carry = (carry << 1) | if prev_in_word { 1u32 } else { 0u32 };
269            let starts = word_mask & !prev_carry;
270            total_words += starts.count_ones() as u64;
271            prev_in_word = (carry >> 31) & 1 == 1;
272
273            batch += 1;
274            if batch >= 255 {
275                let sad = _mm256_sad_epu8(line_acc, zero);
276                let hi = _mm256_extracti128_si256(sad, 1);
277                let lo = _mm256_castsi256_si128(sad);
278                let s = _mm_add_epi64(lo, hi);
279                let h64 = _mm_unpackhi_epi64(s, s);
280                let t = _mm_add_epi64(s, h64);
281                total_lines += _mm_cvtsi128_si64(t) as u64;
282                line_acc = _mm256_setzero_si256();
283                batch = 0;
284            }
285            i += 32;
286        }
287
288        if batch > 0 {
289            let sad = _mm256_sad_epu8(line_acc, zero);
290            let hi = _mm256_extracti128_si256(sad, 1);
291            let lo = _mm256_castsi256_si128(sad);
292            let s = _mm_add_epi64(lo, hi);
293            let h64 = _mm_unpackhi_epi64(s, s);
294            let t = _mm_add_epi64(s, h64);
295            total_lines += _mm_cvtsi128_si64(t) as u64;
296        }
297    }
298
299    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
300}
301
302/// SSE2 variant of count_lw_c_chunk_avx2 — processes 16 bytes per iteration.
303/// See AVX2 function above for algorithm details.
304#[cfg(target_arch = "x86_64")]
305#[target_feature(enable = "sse2")]
306unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
307    use std::arch::x86_64::*;
308
309    let len = data.len();
310    let ptr = data.as_ptr();
311    let mut i = 0usize;
312    let mut total_lines = 0u64;
313    let mut total_words = 0u64;
314    let mut prev_in_word = false;
315
316    unsafe {
317        let nl_byte = _mm_set1_epi8(b'\n' as i8);
318        let zero = _mm_setzero_si128();
319        let ones = _mm_set1_epi8(1);
320        // Printable range detection: 0x21-0x7E
321        let const_0x21 = _mm_set1_epi8(0x21u8 as i8);
322        let const_0x7e = _mm_set1_epi8(0x7Eu8 as i8);
323        // Word-break detection: {0x09-0x0D, 0x20}
324        let const_0x09 = _mm_set1_epi8(0x09u8 as i8);
325        let const_0x0d = _mm_set1_epi8(0x0Du8 as i8);
326        let const_0x20 = _mm_set1_epi8(0x20u8 as i8);
327
328        let mut line_acc = _mm_setzero_si128();
329        let mut batch = 0u32;
330
331        while i + 16 <= len {
332            let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);
333            let is_nl = _mm_cmpeq_epi8(v, nl_byte);
334            line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));
335
336            // Printable check: byte in [0x21, 0x7E] (word content)
337            let ge_21 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x21), v);
338            let le_7e = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x7e), v);
339            let is_printable = _mm_and_si128(ge_21, le_7e);
340            let word_mask = (_mm_movemask_epi8(is_printable) as u32) & 0xFFFF;
341
342            // Word-break check: byte in {0x09-0x0D, 0x20}
343            let ge_09 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x09), v);
344            let le_0d = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x0d), v);
345            let in_tab_range = _mm_and_si128(ge_09, le_0d);
346            let is_space = _mm_cmpeq_epi8(v, const_0x20);
347            let is_break = _mm_or_si128(in_tab_range, is_space);
348            let break_mask = (_mm_movemask_epi8(is_break) as u32) & 0xFFFF;
349
350            // 3-state segmented carry-propagation (16-bit, 4 doubling steps)
351            let transparent = !break_mask & !word_mask & 0xFFFF;
352            let mut carry = (word_mask | if prev_in_word { 1u32 } else { 0u32 }) & 0xFFFF;
353            carry &= !break_mask;
354            let mut pass = transparent;
355            carry |= (carry << 1) & pass;
356            pass &= pass << 1;
357            carry |= (carry << 2) & pass;
358            pass &= pass << 2;
359            carry |= (carry << 4) & pass;
360            pass &= pass << 4;
361            carry |= (carry << 8) & pass;
362            let prev_carry = ((carry << 1) | if prev_in_word { 1u32 } else { 0u32 }) & 0xFFFF;
363            let starts = word_mask & !prev_carry & 0xFFFF;
364            total_words += starts.count_ones() as u64;
365            prev_in_word = (carry >> 15) & 1 == 1;
366
367            batch += 1;
368            if batch >= 255 {
369                let sad = _mm_sad_epu8(line_acc, zero);
370                let hi = _mm_unpackhi_epi64(sad, sad);
371                let t = _mm_add_epi64(sad, hi);
372                total_lines += _mm_cvtsi128_si64(t) as u64;
373                line_acc = _mm_setzero_si128();
374                batch = 0;
375            }
376            i += 16;
377        }
378
379        if batch > 0 {
380            let sad = _mm_sad_epu8(line_acc, zero);
381            let hi = _mm_unpackhi_epi64(sad, sad);
382            let t = _mm_add_epi64(sad, hi);
383            total_lines += _mm_cvtsi128_si64(t) as u64;
384        }
385    }
386
387    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
388}
389
390/// Dispatch to AVX2, SSE2, or scalar chunk counter.
391#[inline]
392fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
393    #[cfg(target_arch = "x86_64")]
394    {
395        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
396            return unsafe { count_lw_c_chunk_avx2(data) };
397        }
398        if data.len() >= 32 {
399            return unsafe { count_lw_c_chunk_sse2(data) };
400        }
401    }
402    count_lw_c_chunk(data)
403}
404
405/// Count words + lines in a C locale chunk using 3-state logic, returning
406/// counts plus boundary info for parallel chunk merging.
407/// Returns (line_count, word_count, first_is_word_content, ends_in_word).
408/// Word-break: 0x09-0x0D, 0x20. Word content: 0x21-0x7E. Transparent: rest.
409fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
410    let mut lines = 0u64;
411    let mut words = 0u64;
412    let mut in_word = false;
413    let mut i = 0;
414    let len = data.len();
415
416    // Determine first byte's classification for boundary merging
417    let first_is_word = first_is_word_c(data);
418
419    while i < len {
420        let b = unsafe { *data.get_unchecked(i) };
421        let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
422        if class == 1 {
423            // Word-break
424            if b == b'\n' {
425                lines += 1;
426            }
427            in_word = false;
428        } else if class == 2 && !in_word {
429            // Word content (printable) — start word
430            in_word = true;
431            words += 1;
432        }
433        // class == 0: transparent — no state change
434        i += 1;
435    }
436    (lines, words, first_is_word, in_word)
437}
438
439/// Count words in UTF-8 locale using 3-state logic matching GNU wc 9.4.
440///
441/// States: transparent (no state change), break (ends word), content (starts/continues word).
442///
443/// Handles:
444/// - ASCII spaces (0x09-0x0D, 0x20): word break
445/// - ASCII control chars (0x00-0x08, 0x0E-0x1F, 0x7F): transparent (don't start or break words)
446/// - ASCII printable (0x21-0x7E): word content
447/// - Valid UTF-8 multi-byte Unicode spaces (U+00A0, U+2000-U+200A, etc.): word break
448/// - Valid UTF-8 multi-byte printable chars: word content
449/// - Invalid UTF-8 encoding errors: transparent (matches GNU mbrtowc which skips
450///   1 byte on error without changing in_word state)
451fn count_words_utf8(data: &[u8]) -> u64 {
452    let mut words = 0u64;
453    let mut in_word = false;
454    let mut i = 0;
455    let len = data.len();
456
457    while i < len {
458        let b = unsafe { *data.get_unchecked(i) };
459
460        if b < 0x80 {
461            // ASCII byte — use 3-state classification matching GNU wc 9.4:
462            // printable (0x21-0x7E) = word content, space = break, control = transparent
463            let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
464            if class == 1 {
465                in_word = false;
466            } else if class == 2 && !in_word {
467                in_word = true;
468                words += 1;
469            }
470            // class == 0: transparent (control chars) — no state change
471            i += 1;
472        } else if b < 0xC2 {
473            // Invalid UTF-8: bare continuation byte (0x80-0xBF) or overlong (0xC0-0xC1)
474            // Encoding error is transparent — matches GNU mbrtowc which skips 1 byte
475            // without changing in_word state on error
476            i += 1;
477        } else if b < 0xE0 {
478            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
479                let cp = ((b as u32 & 0x1F) << 6)
480                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
481                if is_unicode_space(cp) {
482                    in_word = false;
483                } else if !in_word {
484                    in_word = true;
485                    words += 1;
486                }
487                i += 2;
488            } else {
489                // Incomplete sequence — transparent (skip 1 byte, no state change)
490                i += 1;
491            }
492        } else if b < 0xF0 {
493            if i + 2 < len
494                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
495                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
496            {
497                let cp = ((b as u32 & 0x0F) << 12)
498                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
499                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
500                if is_unicode_space(cp) {
501                    in_word = false;
502                } else if !in_word {
503                    in_word = true;
504                    words += 1;
505                }
506                i += 3;
507            } else {
508                // Incomplete sequence — transparent (skip 1 byte, no state change)
509                i += 1;
510            }
511        } else if b < 0xF5 {
512            if i + 3 < len
513                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
514                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
515                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
516            {
517                let cp = ((b as u32 & 0x07) << 18)
518                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
519                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
520                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
521                if is_unicode_space(cp) {
522                    in_word = false;
523                } else if !in_word {
524                    in_word = true;
525                    words += 1;
526                }
527                i += 4;
528            } else {
529                // Incomplete sequence — transparent (skip 1 byte, no state change)
530                i += 1;
531            }
532        } else {
533            // Invalid byte >= 0xF5 — transparent (skip, no state change)
534            i += 1;
535        }
536    }
537
538    words
539}
540
541/// Count lines and words using optimized strategies per locale.
542/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
543/// C locale: AVX2 SIMD fused counter when available, scalar fallback otherwise.
544pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
545    if utf8 {
546        count_lines_words_utf8_fused(data)
547    } else {
548        let (lines, words, _, _) = count_lw_c_chunk_fast(data);
549        (lines, words)
550    }
551}
552
553/// Fused lines+words counting in UTF-8 mode (single pass).
554/// Avoids separate memchr pass for newlines by counting them inline with words.
555/// Uses 3-state logic for ASCII (matching GNU wc 9.4) plus UTF-8 decoding:
556///   - Encoding errors are transparent (matching GNU mbrtowc which skips 1 byte
557///     on error without changing in_word state)
558///   - ASCII control chars are transparent (don't affect word state)
559///   - Printable ASCII and valid multi-byte chars are word content
560fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
561    let mut lines = 0u64;
562    let mut words = 0u64;
563    let mut in_word = false;
564    let mut i = 0;
565    let len = data.len();
566
567    while i < len {
568        let b = unsafe { *data.get_unchecked(i) };
569
570        if b == b'\n' {
571            lines += 1;
572            in_word = false;
573            i += 1;
574        } else if b < 0x80 {
575            // ASCII byte — use 3-state classification
576            let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
577            if class == 1 {
578                in_word = false;
579            } else if class == 2 && !in_word {
580                in_word = true;
581                words += 1;
582            }
583            // class == 0: transparent — no state change
584            i += 1;
585        } else if b < 0xC2 {
586            // Invalid UTF-8: bare continuation or overlong — transparent (skip, no state change)
587            i += 1;
588        } else if b < 0xE0 {
589            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
590                let cp = ((b as u32 & 0x1F) << 6)
591                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
592                if is_unicode_space(cp) {
593                    in_word = false;
594                } else if !in_word {
595                    in_word = true;
596                    words += 1;
597                }
598                i += 2;
599            } else {
600                // Incomplete sequence — transparent (skip 1 byte, no state change)
601                i += 1;
602            }
603        } else if b < 0xF0 {
604            if i + 2 < len
605                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
606                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
607            {
608                let cp = ((b as u32 & 0x0F) << 12)
609                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
610                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
611                if is_unicode_space(cp) {
612                    in_word = false;
613                } else if !in_word {
614                    in_word = true;
615                    words += 1;
616                }
617                i += 3;
618            } else {
619                // Incomplete sequence — transparent (skip 1 byte, no state change)
620                i += 1;
621            }
622        } else if b < 0xF5 {
623            if i + 3 < len
624                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
625                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
626                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
627            {
628                let cp = ((b as u32 & 0x07) << 18)
629                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
630                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
631                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
632                if is_unicode_space(cp) {
633                    in_word = false;
634                } else if !in_word {
635                    in_word = true;
636                    words += 1;
637                }
638                i += 4;
639            } else {
640                // Incomplete sequence — transparent (skip 1 byte, no state change)
641                i += 1;
642            }
643        } else {
644            // Invalid byte >= 0xF5 — transparent (skip, no state change)
645            i += 1;
646        }
647    }
648
649    (lines, words)
650}
651
652/// Count lines, words, and chars using optimized strategies per locale.
653pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
654    if utf8 {
655        // Fused single-pass for lines+words, then fast char-counting pass
656        let (lines, words) = count_lines_words_utf8_fused(data);
657        let chars = count_chars_utf8(data);
658        (lines, words, chars)
659    } else {
660        // C locale: use optimized fused lines+words, chars = byte count
661        let (lines, words) = count_lines_words(data, false);
662        (lines, words, data.len() as u64)
663    }
664}
665
666/// Count UTF-8 characters by counting non-continuation bytes.
667/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
668/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
669///
670/// Uses AVX2 SIMD on x86_64 for ~32 bytes per cycle throughput.
671/// Falls back to 64-byte block processing with popcount on other architectures.
672pub fn count_chars_utf8(data: &[u8]) -> u64 {
673    #[cfg(target_arch = "x86_64")]
674    {
675        if is_x86_feature_detected!("avx2") {
676            return unsafe { count_chars_utf8_avx2(data) };
677        }
678    }
679    count_chars_utf8_scalar(data)
680}
681
/// AVX2 character counter: counts the bytes of `data` that are not UTF-8
/// continuation bytes (`10xxxxxx`).
///
/// Each 32-byte block adds 0 or 1 to every u8 lane of an accumulator; the lanes
/// are summed horizontally with PSADBW after at most 255 blocks so they cannot
/// overflow. Bytes that do not fill a whole block are counted one by one.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    use std::arch::x86_64::*;

    /// Adds up the 32 unsigned byte lanes of `lanes` (PSADBW against zero, then
    /// folding the four 64-bit partial sums).
    #[target_feature(enable = "avx2")]
    unsafe fn sum_byte_lanes(lanes: __m256i) -> u64 {
        unsafe {
            let partial = _mm256_sad_epu8(lanes, _mm256_setzero_si256());
            let upper = _mm256_extracti128_si256(partial, 1);
            let lower = _mm256_castsi256_si128(partial);
            let halves = _mm_add_epi64(lower, upper);
            let swapped = _mm_unpackhi_epi64(halves, halves);
            _mm_cvtsi128_si64(_mm_add_epi64(halves, swapped)) as u64
        }
    }

    unsafe {
        let top_two_bits = _mm256_set1_epi8(0xC0u8 as i8);
        let continuation_tag = _mm256_set1_epi8(0x80u8 as i8);
        let one_per_lane = _mm256_set1_epi8(1);

        let mut char_total = 0u64;
        let mut lane_counts = _mm256_setzero_si256();
        let mut pending_blocks = 0u32;

        let mut blocks = data.chunks_exact(32);
        for block in blocks.by_ref() {
            // SAFETY: `block` is exactly 32 bytes long; the load is unaligned.
            let bytes = _mm256_loadu_si256(block.as_ptr() as *const __m256i);
            let is_continuation =
                _mm256_cmpeq_epi8(_mm256_and_si256(bytes, top_two_bits), continuation_tag);
            // andnot(mask, 1): 1 in every lane that is NOT a continuation byte.
            let starts_char = _mm256_andnot_si256(is_continuation, one_per_lane);
            lane_counts = _mm256_add_epi8(lane_counts, starts_char);

            pending_blocks += 1;
            if pending_blocks >= 255 {
                char_total += sum_byte_lanes(lane_counts);
                lane_counts = _mm256_setzero_si256();
                pending_blocks = 0;
            }
        }

        // Drain whatever is still held in the lane accumulator.
        if pending_blocks > 0 {
            char_total += sum_byte_lanes(lane_counts);
        }

        // Scalar pass over the final partial block.
        for &byte in blocks.remainder() {
            char_total += ((byte & 0xC0) != 0x80) as u64;
        }

        char_total
    }
}
745
/// Scalar implementation of `count_chars_utf8`: counts the bytes that are not
/// UTF-8 continuation bytes (`10xxxxxx`).
///
/// The input is walked in 64-byte blocks. A block whose bytes are all ASCII
/// (their bitwise OR is below 0x80) contributes 64 without further inspection;
/// otherwise the non-continuation bytes of the block are counted individually.
/// The final partial block is counted byte by byte.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    let starts_char = |byte: u8| (byte & 0xC0) != 0x80;

    let mut blocks = data.chunks_exact(64);
    let mut char_total = 0u64;

    for block in blocks.by_ref() {
        // OR of all bytes has its top bit clear only if every byte is ASCII.
        let combined = block.iter().fold(0u8, |acc, &byte| acc | byte);
        char_total += if combined < 0x80 {
            64
        } else {
            block.iter().filter(|&&byte| starts_char(byte)).count() as u64
        };
    }

    let tail_chars = blocks
        .remainder()
        .iter()
        .filter(|&&byte| starts_char(byte))
        .count() as u64;

    char_total + tail_chars
}
797
/// Character count for the C/POSIX locale, where every byte is exactly one
/// character, so the result is simply the slice length.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_total = data.len();
    byte_total as u64
}
803
804/// Count characters, choosing behavior based on locale.
805#[inline]
806pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
807    if utf8 {
808        count_chars_utf8(data)
809    } else {
810        count_chars_c(data)
811    }
812}
813
/// Report whether the process locale selects a UTF-8 encoding.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order. The first
/// variable that is set to a non-empty (and Unicode-decodable) value decides
/// the answer: it is a UTF-8 locale when the value contains `utf-8` or `utf8`,
/// compared case-insensitively. If none of the variables qualifies the result
/// is `false`.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        // Lazily read each variable; unset or non-Unicode values are skipped.
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
}
826
/// Decode one UTF-8 character from the start of a byte slice.
///
/// Returns `(codepoint, byte_length)` for a well-formed sequence. For any
/// malformed or truncated sequence it returns `(first_byte as u32, 1)` so the
/// caller can resynchronise on the next byte.
///
/// Only well-formed UTF-8 is accepted: besides stray continuation bytes and
/// overlong 2-byte forms (`0xC0`/`0xC1`), this rejects overlong 3- and 4-byte
/// encodings, UTF-16 surrogates (U+D800..=U+DFFF) and values above U+10FFFF.
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let b0 = bytes[0];
    // Result for every malformed case: the lead byte on its own.
    let invalid = (b0 as u32, 1);
    let is_continuation = |b: u8| b & 0xC0 == 0x80;

    match b0 {
        0x00..=0x7F => (b0 as u32, 1),

        // 0x80..=0xBF are continuation bytes, 0xC0/0xC1 can only start
        // overlong 2-byte forms; both fall through to the catch-all arm.
        0xC2..=0xDF => match bytes {
            [_, b1, ..] if is_continuation(*b1) => {
                (((b0 as u32 & 0x1F) << 6) | (*b1 as u32 & 0x3F), 2)
            }
            _ => invalid,
        },

        0xE0..=0xEF => {
            // The legal range of the second byte depends on the lead byte:
            // E0 must not encode < U+0800 (overlong), ED must not encode a surrogate.
            let (lo, hi): (u8, u8) = match b0 {
                0xE0 => (0xA0, 0xBF),
                0xED => (0x80, 0x9F),
                _ => (0x80, 0xBF),
            };
            match bytes {
                [_, b1, b2, ..] if (lo..=hi).contains(b1) && is_continuation(*b2) => {
                    let cp = ((b0 as u32 & 0x0F) << 12)
                        | ((*b1 as u32 & 0x3F) << 6)
                        | (*b2 as u32 & 0x3F);
                    (cp, 3)
                }
                _ => invalid,
            }
        }

        0xF0..=0xF4 => {
            // F0 must not encode < U+10000 (overlong), F4 must not exceed U+10FFFF.
            let (lo, hi): (u8, u8) = match b0 {
                0xF0 => (0x90, 0xBF),
                0xF4 => (0x80, 0x8F),
                _ => (0x80, 0xBF),
            };
            match bytes {
                [_, b1, b2, b3, ..]
                    if (lo..=hi).contains(b1) && is_continuation(*b2) && is_continuation(*b3) =>
                {
                    let cp = ((b0 as u32 & 0x07) << 18)
                        | ((*b1 as u32 & 0x3F) << 12)
                        | ((*b2 as u32 & 0x3F) << 6)
                        | (*b3 as u32 & 0x3F);
                    (cp, 4)
                }
                _ => invalid,
            }
        }

        _ => invalid,
    }
}
870
/// Check if a Unicode codepoint is a zero-width character (combining mark, etc.).
/// GNU wc uses wcwidth() which returns 0 for these. We must match.
///
/// The answer comes from a fixed, hand-maintained list of inclusive code point
/// ranges: `true` if `cp` lies in any listed range, `false` for everything else
/// (including all ASCII). Entries are listed in ascending order.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    matches!(
        cp,
        0x0300..=0x036F   // Combining Diacritical Marks
        | 0x0483..=0x0489 // Cyrillic combining marks
        | 0x0591..=0x05BD // Hebrew combining marks
        | 0x05BF
        | 0x05C1..=0x05C2
        | 0x05C4..=0x05C5
        | 0x05C7
        | 0x0600..=0x0605 // Arabic number signs
        | 0x0610..=0x061A // Arabic combining marks
        | 0x064B..=0x065F // Arabic combining marks
        | 0x0670
        | 0x06D6..=0x06DD
        | 0x06DF..=0x06E4
        | 0x06E7..=0x06E8
        | 0x06EA..=0x06ED
        | 0x070F
        | 0x0711
        | 0x0730..=0x074A
        | 0x07A6..=0x07B0
        | 0x07EB..=0x07F3
        | 0x07FD
        | 0x0816..=0x0819
        | 0x081B..=0x0823
        | 0x0825..=0x0827
        | 0x0829..=0x082D
        | 0x0859..=0x085B
        | 0x08D3..=0x08E1
        | 0x08E3..=0x0902
        | 0x093A
        | 0x093C
        | 0x0941..=0x0948
        | 0x094D
        | 0x0951..=0x0957
        | 0x0962..=0x0963
        | 0x0981
        | 0x09BC
        | 0x09C1..=0x09C4
        | 0x09CD
        | 0x09E2..=0x09E3
        | 0x09FE
        | 0x0A01..=0x0A02
        | 0x0A3C
        | 0x0A41..=0x0A42
        | 0x0A47..=0x0A48
        | 0x0A4B..=0x0A4D
        | 0x0A51
        | 0x0A70..=0x0A71
        | 0x0A75
        | 0x0A81..=0x0A82
        | 0x0ABC
        | 0x0AC1..=0x0AC5
        | 0x0AC7..=0x0AC8
        | 0x0ACD
        | 0x0AE2..=0x0AE3
        | 0x0AFA..=0x0AFF
        | 0x0B01
        | 0x0B3C
        | 0x0B3F
        | 0x0B41..=0x0B44
        | 0x0B4D
        | 0x0B56
        | 0x0B62..=0x0B63
        | 0x0B82
        | 0x0BC0
        | 0x0BCD
        | 0x0C00
        | 0x0C04
        | 0x0C3E..=0x0C40
        | 0x0C46..=0x0C48
        | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56
        | 0x0C62..=0x0C63
        | 0x0C81
        | 0x0CBC
        | 0x0CBF
        | 0x0CC6
        | 0x0CCC..=0x0CCD
        | 0x0CE2..=0x0CE3
        | 0x0D00..=0x0D01
        | 0x0D3B..=0x0D3C
        | 0x0D41..=0x0D44
        | 0x0D4D
        | 0x0D62..=0x0D63
        | 0x0DCA
        | 0x0DD2..=0x0DD4
        | 0x0DD6
        | 0x0E31
        | 0x0E34..=0x0E3A
        | 0x0E47..=0x0E4E
        | 0x0EB1
        | 0x0EB4..=0x0EBC
        | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19
        | 0x0F35
        | 0x0F37
        | 0x0F39
        | 0x0F71..=0x0F7E
        | 0x0F80..=0x0F84
        | 0x0F86..=0x0F87
        | 0x0F8D..=0x0F97
        | 0x0F99..=0x0FBC
        | 0x0FC6
        | 0x102D..=0x1030
        | 0x1032..=0x1037
        | 0x1039..=0x103A
        | 0x103D..=0x103E
        | 0x1058..=0x1059
        | 0x105E..=0x1060
        | 0x1071..=0x1074
        | 0x1082
        | 0x1085..=0x1086
        | 0x108D
        | 0x109D
        | 0x1160..=0x11FF // Hangul Jamo medial vowels and final consonants
        | 0x135D..=0x135F
        | 0x1712..=0x1714
        | 0x1732..=0x1734
        | 0x1752..=0x1753
        | 0x1772..=0x1773
        | 0x17B4..=0x17B5
        | 0x17B7..=0x17BD
        | 0x17C6
        | 0x17C9..=0x17D3
        | 0x17DD
        | 0x180B..=0x180D
        | 0x1885..=0x1886
        | 0x18A9
        | 0x1920..=0x1922
        | 0x1927..=0x1928
        | 0x1932
        | 0x1939..=0x193B
        | 0x1A17..=0x1A18
        | 0x1A1B
        | 0x1A56
        | 0x1A58..=0x1A5E
        | 0x1A60
        | 0x1A62
        | 0x1A65..=0x1A6C
        | 0x1A73..=0x1A7C
        | 0x1A7F
        | 0x1AB0..=0x1ABE
        | 0x1B00..=0x1B03
        | 0x1B34
        | 0x1B36..=0x1B3A
        | 0x1B3C
        | 0x1B42
        | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81
        | 0x1BA2..=0x1BA5
        | 0x1BA8..=0x1BA9
        | 0x1BAB..=0x1BAD
        | 0x1BE6
        | 0x1BE8..=0x1BE9
        | 0x1BED
        | 0x1BEF..=0x1BF1
        | 0x1C2C..=0x1C33
        | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2
        | 0x1CD4..=0x1CE0
        | 0x1CE2..=0x1CE8
        | 0x1CED
        | 0x1CF4
        | 0x1CF8..=0x1CF9
        | 0x1DC0..=0x1DF9
        | 0x1DFB..=0x1DFF
        | 0x200B..=0x200F // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        | 0x202A..=0x202E // Bidi control chars
        | 0x2060..=0x2064 // Word joiner, invisible operators
        | 0x2066..=0x206F // Bidi isolates
        | 0x20D0..=0x20F0 // Combining marks for symbols
        | 0xFE00..=0xFE0F // Variation Selectors
        | 0xFE20..=0xFE2F // Combining Half Marks
        | 0xFEFF          // Zero Width No-Break Space (BOM)
        | 0xFFF9..=0xFFFB // Interlinear annotation anchors
        | 0x1D167..=0x1D169
        | 0x1D173..=0x1D182
        | 0x1D185..=0x1D18B
        | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001
        | 0xE0020..=0xE007F
        | 0xE0100..=0xE01EF // Variation Selectors Supplement
    )
}
1061
/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
/// Matches glibc wcwidth() behavior for maximum GNU compatibility.
///
/// The answer comes from a fixed list of inclusive code point ranges: `true`
/// if `cp` lies in any listed range, `false` otherwise.
///
/// NOTE(review): several single-code-point entries in the U+2702..=U+27A1,
/// U+2934..=U+2935 and U+2B05..=U+2B07 area appear to be emoji whose Unicode
/// East_Asian_Width is Neutral rather than Wide — confirm against the
/// targeted glibc `wcwidth()` before relying on exact GNU parity.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F   // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3040..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3  // Hangul Syllables
        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19  // Vertical Forms
        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
        | 0xFF01..=0xFF60  // Fullwidth ASCII variants and fullwidth white parentheses
        | 0xFFE0..=0xFFE6  // Fullwidth Signs
        | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
1148
/// Widest line, in display columns, under C/POSIX locale rules (`wc -L`).
///
/// Column accounting per byte:
/// - `0x21..=0x7E` and space: one column each
/// - `\t`: jump to the next multiple of 8
/// - `\r`: column returns to 0, but the line's widest point is remembered
/// - `\n` and `\f`: finish the line (its widest point competes for the
///   maximum) and start a fresh one
/// - any other byte (controls, DEL, bytes >= 0x80): no width
///
/// Runs of printable non-space ASCII are measured in one step rather than
/// byte by byte.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut widest: u64 = 0; // widest finished line so far
    let mut line_peak: u64 = 0; // furthest column reached on the current line
    let mut col: u64 = 0; // current column
    let mut rest = data;

    while let Some((&byte, tail)) = rest.split_first() {
        match byte {
            0x21..=0x7E => {
                // Measure the whole printable run starting at `byte`.
                let run = 1 + tail
                    .iter()
                    .take_while(|&&next| matches!(next, 0x21..=0x7E))
                    .count();
                col += run as u64;
                line_peak = line_peak.max(col);
                rest = &rest[run..];
                continue;
            }
            b' ' => {
                col += 1;
                line_peak = line_peak.max(col);
            }
            b'\t' => {
                col = (col + 8) & !7;
                line_peak = line_peak.max(col);
            }
            b'\r' => col = 0,
            b'\n' | 0x0C => {
                widest = widest.max(line_peak);
                col = 0;
                line_peak = 0;
            }
            _ => {}
        }
        rest = tail;
    }

    // The final line may lack a terminator.
    widest.max(line_peak)
}
1230
1231/// Compute maximum display width of any line (UTF-8 locale).
1232///
1233/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
1234/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
1235///
1236/// Optimized with printable ASCII run counting for common text.
1237pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1238    let mut max_len: u64 = 0;
1239    let mut line_len: u64 = 0;
1240    let mut linepos: u64 = 0;
1241    let mut i = 0;
1242    let len = data.len();
1243
1244    while i < len {
1245        let b = unsafe { *data.get_unchecked(i) };
1246
1247        if b >= 0x21 && b <= 0x7E {
1248            // Printable non-space ASCII (most common) — count run length
1249            i += 1;
1250            let mut run = 1u64;
1251            while i < len {
1252                let b = unsafe { *data.get_unchecked(i) };
1253                if b >= 0x21 && b <= 0x7E {
1254                    run += 1;
1255                    i += 1;
1256                } else {
1257                    break;
1258                }
1259            }
1260            linepos += run;
1261            if linepos > line_len {
1262                line_len = linepos;
1263            }
1264        } else if b < 0x80 {
1265            // Other ASCII: space, tab, newline, controls
1266            match b {
1267                b' ' => {
1268                    linepos += 1;
1269                    if linepos > line_len {
1270                        line_len = linepos;
1271                    }
1272                }
1273                b'\n' => {
1274                    if line_len > max_len {
1275                        max_len = line_len;
1276                    }
1277                    linepos = 0;
1278                    line_len = 0;
1279                }
1280                b'\t' => {
1281                    linepos = (linepos + 8) & !7;
1282                    if linepos > line_len {
1283                        line_len = linepos;
1284                    }
1285                }
1286                b'\r' => {
1287                    linepos = 0;
1288                }
1289                0x0C => {
1290                    if line_len > max_len {
1291                        max_len = line_len;
1292                    }
1293                    linepos = 0;
1294                    line_len = 0;
1295                }
1296                _ => {} // Non-printable: width 0
1297            }
1298            i += 1;
1299        } else {
1300            // Multibyte UTF-8
1301            let (cp, len) = decode_utf8(&data[i..]);
1302
1303            // C1 control characters (0x80..0x9F): non-printable, width 0
1304            if cp <= 0x9F {
1305                // width 0
1306            } else if is_zero_width(cp) {
1307                // Combining marks, zero-width chars: width 0
1308            } else if is_wide_char(cp) {
1309                linepos += 2;
1310                if linepos > line_len {
1311                    line_len = linepos;
1312                }
1313            } else {
1314                // Regular printable Unicode character: width 1
1315                linepos += 1;
1316                if linepos > line_len {
1317                    line_len = linepos;
1318                }
1319            }
1320            i += len;
1321        }
1322    }
1323
1324    // Handle last line
1325    if line_len > max_len {
1326        max_len = line_len;
1327    }
1328
1329    max_len
1330}
1331
1332/// Compute maximum display width, choosing behavior based on locale.
1333#[inline]
1334pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1335    if utf8 {
1336        max_line_length_utf8(data)
1337    } else {
1338        max_line_length_c(data)
1339    }
1340}
1341
1342/// Count all metrics using optimized individual passes.
1343///
1344/// Each metric uses its own optimized algorithm:
1345/// - Lines: SIMD-accelerated memchr
1346/// - Words: 3-state scalar/state-machine (locale-dependent)
1347/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1348/// - Max line length: locale-aware display width tracking
1349///
1350/// Multi-pass is faster than single-pass because each pass has a tight,
1351/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1352/// making subsequent passes nearly free for memory bandwidth.
1353pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1354    if utf8 {
1355        let (lines, words) = count_lines_words_utf8_fused(data);
1356        WcCounts {
1357            lines,
1358            words,
1359            bytes: data.len() as u64,
1360            chars: count_chars_utf8(data),
1361            max_line_length: max_line_length_utf8(data),
1362        }
1363    } else {
1364        WcCounts {
1365            lines: count_lines(data),
1366            words: count_words_locale(data, false),
1367            bytes: data.len() as u64,
1368            chars: data.len() as u64,
1369            max_line_length: max_line_length_c(data),
1370        }
1371    }
1372}
1373
/// Heuristic ASCII probe: inspects up to three 256-byte windows (start,
/// middle, end) and reports `false` as soon as one of them contains a byte
/// >= 0x80.
///
/// This is a sample, not a proof: bytes outside the windows are never
/// examined, so `true` only means "probably ASCII". Empty input yields `true`.
/// The middle window is only probed when the input is longer than two
/// windows, the end window only when it is longer than one.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    const WINDOW: usize = 256;

    let total = data.len();
    if total == 0 {
        return true;
    }
    let span = WINDOW.min(total);
    let window_is_ascii = |from: usize, to: usize| data[from..to].is_ascii();

    // Leading window.
    if !window_is_ascii(0, span) {
        return false;
    }

    // Window centred on the midpoint.
    if total > span * 2 {
        let from = (total / 2).saturating_sub(span / 2);
        let to = (from + span).min(total);
        if !window_is_ascii(from, to) {
            return false;
        }
    }

    // Trailing window.
    if total > span && !window_is_ascii(total - span, total) {
        return false;
    }

    true
}
1432
1433// ──────────────────────────────────────────────────
1434// Parallel counting for large files
1435// ──────────────────────────────────────────────────
1436
1437/// Split data into chunks at newline boundaries for parallel processing.
1438/// Returns slices where each slice (except possibly the last) ends with `\n`.
1439/// Splitting at newlines guarantees word boundaries in any locale,
1440/// enabling safe parallel word counting without boundary adjustment.
1441fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1442    if data.is_empty() || num_chunks <= 1 {
1443        return vec![data];
1444    }
1445    let chunk_size = data.len() / num_chunks;
1446    let mut chunks = Vec::with_capacity(num_chunks);
1447    let mut pos = 0;
1448
1449    for _ in 0..num_chunks - 1 {
1450        let target = pos + chunk_size;
1451        if target >= data.len() {
1452            break;
1453        }
1454        let boundary = memchr::memchr(b'\n', &data[target..])
1455            .map(|p| target + p + 1)
1456            .unwrap_or(data.len());
1457        if boundary > pos {
1458            chunks.push(&data[pos..boundary]);
1459        }
1460        pos = boundary;
1461    }
1462    if pos < data.len() {
1463        chunks.push(&data[pos..]);
1464    }
1465    chunks
1466}
1467
1468/// Count newlines in parallel using SIMD memchr + rayon.
1469/// Each thread gets at least 1MB (to amortize rayon scheduling overhead).
1470pub fn count_lines_parallel(data: &[u8]) -> u64 {
1471    if data.len() < PARALLEL_THRESHOLD {
1472        return count_lines(data);
1473    }
1474
1475    let num_threads = rayon::current_num_threads().max(1);
1476    // Ensure chunks are large enough to amortize SIMD setup overhead
1477    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1478
1479    data.par_chunks(chunk_size)
1480        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1481        .sum()
1482}
1483
1484/// Count words in parallel with boundary adjustment.
1485pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1486    if data.len() < PARALLEL_THRESHOLD {
1487        return count_words_locale(data, utf8);
1488    }
1489
1490    let num_threads = rayon::current_num_threads().max(1);
1491
1492    if utf8 {
1493        // UTF-8: split at newline boundaries for safe parallel word counting.
1494        // Newlines are always word boundaries, so no boundary adjustment needed.
1495        let chunks = split_at_newlines(data, num_threads);
1496        chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1497    } else {
1498        // C locale: parallel 3-state word counting with boundary adjustment
1499        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1500
1501        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1502
1503        // Each chunk returns (lines, word_count, first_active_is_printable, ends_in_word)
1504        let results: Vec<(u64, u64, bool, bool)> = chunks
1505            .par_iter()
1506            .map(|chunk| count_lw_c_chunk(chunk))
1507            .collect();
1508
1509        let mut total = 0u64;
1510        for i in 0..results.len() {
1511            total += results[i].1;
1512            // Boundary adjustment: if previous chunk ended in_word AND
1513            // current chunk's first non-transparent byte is printable,
1514            // the word was split across chunks — subtract the overcount.
1515            if i > 0 && results[i - 1].3 && results[i].2 {
1516                total -= 1;
1517            }
1518        }
1519        total
1520    }
1521}
1522
1523/// Count UTF-8 characters in parallel.
1524pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1525    if !utf8 {
1526        return data.len() as u64;
1527    }
1528    if data.len() < PARALLEL_THRESHOLD {
1529        return count_chars_utf8(data);
1530    }
1531
1532    let num_threads = rayon::current_num_threads().max(1);
1533    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1534
1535    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1536}
1537
1538/// Count lines + words + bytes in a single fused pass (the default wc mode).
1539/// Avoids separate passes entirely — combines newline counting with word detection.
1540pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1541    let (lines, words) = count_lines_words(data, utf8);
1542    (lines, words, data.len() as u64)
1543}
1544
1545/// Parallel counting of lines + words + bytes only (no chars).
1546/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1547/// C locale: single fused pass per chunk counts BOTH lines and words.
1548/// UTF-8: checks ASCII first for C locale fast path, else splits at newlines
1549/// for safe parallel UTF-8 word counting.
1550pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1551    if data.len() < PARALLEL_THRESHOLD {
1552        // Small file: use fused single-pass
1553        return count_lwb(data, utf8);
1554    }
1555
1556    let num_threads = rayon::current_num_threads().max(1);
1557
1558    let (lines, words) = if !utf8 {
1559        // C locale: FUSED parallel lines+words counting — single pass per chunk
1560        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1561
1562        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1563        let results: Vec<(u64, u64, bool, bool)> = chunks
1564            .par_iter()
1565            .map(|chunk| count_lw_c_chunk_fast(chunk))
1566            .collect();
1567
1568        let mut line_total = 0u64;
1569        let mut word_total = 0u64;
1570        for i in 0..results.len() {
1571            line_total += results[i].0;
1572            word_total += results[i].1;
1573            if i > 0 && results[i - 1].3 && results[i].2 {
1574                word_total -= 1;
1575            }
1576        }
1577
1578        (line_total, word_total)
1579    } else {
1580        // UTF-8 locale: check if ASCII for faster C locale path
1581        let is_ascii = check_ascii_sample(data);
1582        if is_ascii {
1583            // Pure ASCII: use C locale parallel path (arbitrary chunks OK)
1584            let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1585            let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1586            let results: Vec<(u64, u64, bool, bool)> = chunks
1587                .par_iter()
1588                .map(|chunk| count_lw_c_chunk_fast(chunk))
1589                .collect();
1590
1591            let mut line_total = 0u64;
1592            let mut word_total = 0u64;
1593            for i in 0..results.len() {
1594                line_total += results[i].0;
1595                word_total += results[i].1;
1596                if i > 0 && results[i - 1].3 && results[i].2 {
1597                    word_total -= 1;
1598                }
1599            }
1600            (line_total, word_total)
1601        } else {
1602            // Non-ASCII UTF-8: split at newline boundaries for safe parallel
1603            // word counting. Newlines always break words, so no adjustment needed.
1604            let chunks = split_at_newlines(data, num_threads);
1605            let results: Vec<(u64, u64)> = chunks
1606                .par_iter()
1607                .map(|chunk| count_lines_words_utf8_fused(chunk))
1608                .collect();
1609            let mut line_total = 0u64;
1610            let mut word_total = 0u64;
1611            for (l, w) in results {
1612                line_total += l;
1613                word_total += w;
1614            }
1615            (line_total, word_total)
1616        }
1617    };
1618
1619    (lines, words, data.len() as u64)
1620}
1621
1622/// Combined parallel counting of lines + words + chars.
1623/// UTF-8: splits at newline boundaries for fused lines+words+chars per chunk.
1624/// C locale: fused parallel lines+words with boundary adjustment + parallel chars.
1625pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1626    if data.len() < PARALLEL_THRESHOLD {
1627        let lines = count_lines(data);
1628        let words = count_words_locale(data, utf8);
1629        let chars = count_chars(data, utf8);
1630        return (lines, words, chars);
1631    }
1632
1633    let num_threads = rayon::current_num_threads().max(1);
1634
1635    if utf8 {
1636        // UTF-8: fused parallel lines+words+chars per chunk (split at newlines)
1637        let chunks = split_at_newlines(data, num_threads);
1638        let results: Vec<(u64, u64, u64)> = chunks
1639            .par_iter()
1640            .map(|chunk| {
1641                let (lines, words) = count_lines_words_utf8_fused(chunk);
1642                let chars = count_chars_utf8(chunk);
1643                (lines, words, chars)
1644            })
1645            .collect();
1646        let mut lines = 0u64;
1647        let mut words = 0u64;
1648        let mut chars = 0u64;
1649        for (l, w, c) in results {
1650            lines += l;
1651            words += w;
1652            chars += c;
1653        }
1654        (lines, words, chars)
1655    } else {
1656        // C locale: fused parallel lines+words + parallel chars (= byte count)
1657        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1658        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1659        let results: Vec<(u64, u64, bool, bool)> = chunks
1660            .par_iter()
1661            .map(|chunk| count_lw_c_chunk_fast(chunk))
1662            .collect();
1663        let mut lines = 0u64;
1664        let mut words = 0u64;
1665        for i in 0..results.len() {
1666            lines += results[i].0;
1667            words += results[i].1;
1668            if i > 0 && results[i - 1].3 && results[i].2 {
1669                words -= 1;
1670            }
1671        }
1672        (lines, words, data.len() as u64)
1673    }
1674}
1675
1676/// Parallel max line length computation.
1677/// Splits at newline boundaries so each chunk independently computes correct
1678/// max line width (since newlines reset position tracking).
1679pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1680    if data.len() < PARALLEL_THRESHOLD {
1681        return max_line_length(data, utf8);
1682    }
1683    let num_threads = rayon::current_num_threads().max(1);
1684    let chunks = split_at_newlines(data, num_threads);
1685    chunks
1686        .par_iter()
1687        .map(|chunk| {
1688            if utf8 {
1689                max_line_length_utf8(chunk)
1690            } else {
1691                max_line_length_c(chunk)
1692            }
1693        })
1694        .max()
1695        .unwrap_or(0)
1696}
1697
1698/// Parallel counting of all metrics at once.
1699/// Splits at newline boundaries for safe parallel word + max_line_length counting.
1700/// Each chunk computes all metrics in a single traversal group, maximizing cache reuse.
1701pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1702    if data.len() < PARALLEL_THRESHOLD {
1703        return count_all(data, utf8);
1704    }
1705
1706    let num_threads = rayon::current_num_threads().max(1);
1707    let chunks = split_at_newlines(data, num_threads);
1708
1709    if utf8 {
1710        let results: Vec<(u64, u64, u64, u64)> = chunks
1711            .par_iter()
1712            .map(|chunk| {
1713                let (lines, words) = count_lines_words_utf8_fused(chunk);
1714                let chars = count_chars_utf8(chunk);
1715                let max_ll = max_line_length_utf8(chunk);
1716                (lines, words, chars, max_ll)
1717            })
1718            .collect();
1719
1720        let mut counts = WcCounts {
1721            bytes: data.len() as u64,
1722            ..Default::default()
1723        };
1724        for (l, w, c, m) in results {
1725            counts.lines += l;
1726            counts.words += w;
1727            counts.chars += c;
1728            if m > counts.max_line_length {
1729                counts.max_line_length = m;
1730            }
1731        }
1732        counts
1733    } else {
1734        // C locale: fused lines+words per chunk + max_line_length per chunk
1735        let results: Vec<(u64, u64, u64)> = chunks
1736            .par_iter()
1737            .map(|chunk| {
1738                let (lines, words) = count_lines_words(chunk, false);
1739                let max_ll = max_line_length_c(chunk);
1740                (lines, words, max_ll)
1741            })
1742            .collect();
1743
1744        let mut counts = WcCounts {
1745            bytes: data.len() as u64,
1746            chars: data.len() as u64,
1747            ..Default::default()
1748        };
1749        for (l, w, m) in &results {
1750            counts.lines += l;
1751            counts.words += w;
1752            if *m > counts.max_line_length {
1753                counts.max_line_length = *m;
1754            }
1755        }
1756        counts
1757    }
1758}