//! Core counting routines for `wc` (coreutils_rs/wc/core.rs).
1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Minimum data size to use parallel processing (1MB).
/// Rayon overhead is ~5-10μs per task; at 1MB with memchr SIMD (~10 GB/s),
/// each chunk takes ~100μs, so overhead is < 10%.
/// NOTE(review): not referenced in this chunk — presumably consumed by a
/// rayon-based parallel driver later in the file; confirm before removing.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
8
/// Results from counting a byte slice.
///
/// Each counting function in this module fills in only the fields it
/// computes; `Default` zero-initializes the rest.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    // Number of newline bytes (`\n`), per GNU wc semantics.
    pub lines: u64,
    // Number of words (maximal runs of word-content bytes/chars).
    pub words: u64,
    // Total byte count of the input.
    pub bytes: u64,
    // Character count: bytes in the C locale, decoded chars in UTF-8.
    pub chars: u64,
    // Longest line length — presumably for wc's `-L` option; the computation
    // is not in this chunk. TODO(review): confirm display-width semantics.
    pub max_line_length: u64,
}
18
19// ──────────────────────────────────────────────────
20// Byte classification for word counting
21// ──────────────────────────────────────────────────
22//
23// GNU wc 9.4 uses a 3-state model for word counting in UTF-8 locales:
24//   - Space (word-break): whitespace bytes (0x09-0x0D, 0x20, 0xA0)
25//   - Printable (word content): printable characters (ASCII 0x21-0x7E, valid Unicode)
26//   - Transparent (no state change): NUL, control chars, DEL, invalid/overlong
27//     UTF-8, and non-printable Unicode characters
28//
29// In the C locale fast path (IS_SPACE table):
30//   - 0x09-0x0D, 0x20: whitespace
31//   - 0xA0: whitespace (NBSP via glibc Latin-1 identity mapping)
32//   - All other high bytes (0x80-0xFF except 0xA0): not whitespace
33//
34// In UTF-8 locale with multibyte path:
35//   - ASCII bytes use the IS_SPACE table
36//   - Valid multibyte chars: iswspace() for space, iswprint() for word content
37//   - Non-printable Unicode: transparent (no state change)
38//   - Encoding errors (EILSEQ): transparent (no state change)
39
/// Byte-level space table matching GNU wc 9.7 `wc_isspace[]`.
/// true = whitespace (word break), false = word content.
/// Includes the 6 standard C locale whitespace bytes plus byte 0xa0 (NBSP).
///
/// GNU wc 9.7 uses: `wc_isspace[i] = isspace(i) || iswnbspace(btoc32(i))`
/// where iswnbspace returns true for U+00A0 (NBSP) when POSIXLY_CORRECT is not set.
/// In glibc's C locale, btoc32(0xa0) maps to U+00A0 via Latin-1 identity mapping.
const fn make_is_space() -> [bool; 256] {
    // Whitespace byte values: tab, newline, vertical tab, form feed,
    // carriage return, space, and NBSP (0xA0, matching GNU wc iswnbspace).
    let whitespace: [u8; 7] = [0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x20, 0xA0];
    let mut table = [false; 256];
    let mut idx = 0;
    while idx < whitespace.len() {
        table[whitespace[idx] as usize] = true;
        idx += 1;
    }
    table
}
const IS_SPACE: [bool; 256] = make_is_space();
59
60/// For parallel chunk merging: determine if a chunk starts with word content
61/// (i.e., the first byte is not whitespace).
62#[inline]
63pub(crate) fn first_is_word(data: &[u8]) -> bool {
64    !data.is_empty() && !IS_SPACE[data[0] as usize]
65}
66
67// ──────────────────────────────────────────────────
68// Unicode character classification helpers
69// ──────────────────────────────────────────────────
70
71/// Check if a Unicode codepoint is a whitespace character (matching glibc iswspace).
72/// Only covers multi-byte Unicode spaces; ASCII spaces are handled by the byte table.
73#[inline]
74fn is_unicode_space(cp: u32) -> bool {
75    matches!(
76        cp,
77        0x1680 |           // Ogham Space Mark
78        0x2000
79            ..=0x200A |  // En Quad through Hair Space
80        0x2028 |           // Line Separator
81        0x2029 |           // Paragraph Separator
82        0x205F |           // Medium Mathematical Space
83        0x3000 // Ideographic Space
84    )
85}
86
/// Check if a Unicode codepoint is a "non-breaking space" that GNU wc treats
/// as a word separator (when POSIXLY_CORRECT is not set).
/// Matches GNU wc 9.7 iswnbspace(): U+00A0, U+2007, U+202F, U+2060.
#[inline]
fn is_wnbspace(cp: u32) -> bool {
    cp == 0x00A0 // no-break space
        || cp == 0x2007 // figure space
        || cp == 0x202F // narrow no-break space
        || cp == 0x2060 // word joiner
}
94
95/// Check if a Unicode codepoint is any kind of space (iswspace || iswnbspace).
96#[inline]
97fn is_unicode_word_break(cp: u32) -> bool {
98    is_unicode_space(cp) || is_wnbspace(cp)
99}
100
/// Check if a Unicode codepoint is "printable" for the 3-state word counting model.
/// Matches glibc's iswprint(): true for graphic characters and space-like characters,
/// false for control characters and unassigned/private-use.
/// In practice, almost all valid Unicode codepoints >= 0x80 that aren't spaces are printable.
#[inline]
fn is_printable_unicode(cp: u32) -> bool {
    match cp {
        // C0 controls, ASCII range (handled separately in the ASCII path),
        // DEL, and C1 controls: all non-printable here.
        0..=0x9F => false,
        // UTF-16 surrogates cannot appear in valid UTF-8; reject defensively.
        0xD800..=0xDFFF => false,
        // Unicode noncharacter block U+FDD0-U+FDEF.
        0xFDD0..=0xFDEF => false,
        // Beyond the last valid codepoint.
        c if c > 0x10FFFF => false,
        // Plane-final noncharacters U+xFFFE / U+xFFFF.
        c if c & 0xFFFE == 0xFFFE => false,
        // Everything else (>= 0xA0, valid, assigned-or-private) is treated
        // as printable for word-counting purposes.
        _ => true,
    }
}
128
129// ──────────────────────────────────────────────────
130// Core counting functions
131// ──────────────────────────────────────────────────
132
/// Count newlines using SIMD-accelerated memchr.
/// GNU wc counts newline bytes (`\n`), not logical lines — a trailing line
/// without a final newline does not increment the count.
#[inline]
pub fn count_lines(data: &[u8]) -> u64 {
    memchr_iter(b'\n', data).count() as u64
}
139
/// Count bytes. Trivial but included for API consistency.
/// (`usize as u64` is lossless on all supported target widths.)
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    data.len() as u64
}
145
/// Count words in the default locale (UTF-8).
/// Uses the 3-state GNU-wc-compatible classification of `count_words_utf8`
/// (space / printable word content / transparent).
pub fn count_words(data: &[u8]) -> u64 {
    count_words_locale(data, true)
}
150
/// Count words with explicit locale control.
///
/// GNU wc classifies each byte/character as:
///   - space (whitespace): sets in_word=false
///   - word content: sets in_word=true, increments word count on transition
///
/// The C-locale path uses a 2-state byte model; the UTF-8 path additionally
/// treats non-printable and invalid sequences as transparent (3-state).
pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
    if utf8 {
        count_words_utf8(data)
    } else {
        count_words_c(data)
    }
}
163
164/// Count words in C/POSIX locale using 2-state logic.
165/// Every byte is either whitespace (0x09-0x0D, 0x20, 0xA0) or word content.
166/// NUL bytes, control chars, DEL, and high bytes (0x80-0xFF except 0xA0) are word content.
167fn count_words_c(data: &[u8]) -> u64 {
168    let mut words = 0u64;
169    let mut in_word = false;
170    let mut i = 0;
171    let len = data.len();
172
173    while i < len {
174        let b = unsafe { *data.get_unchecked(i) };
175        if IS_SPACE[b as usize] {
176            in_word = false;
177        } else if !in_word {
178            in_word = true;
179            words += 1;
180        }
181        i += 1;
182    }
183    words
184}
185
/// Scalar tail for SIMD line+word counters: processes remaining bytes after
/// the SIMD loop and returns final counts with boundary info as
/// (line_count, word_count, first_is_word_content, ends_in_word).
///
/// `ptr`/`len` describe the same buffer as `data`; `i` is the position where
/// the SIMD loop stopped, and `total_lines`/`total_words`/`prev_in_word`
/// carry its accumulated state. `data` is used only to recompute the
/// chunk's leading-byte word flag for parallel merging.
///
/// SAFETY: caller must ensure ptr is valid for [0..len) and i <= len.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn count_lw_c_scalar_tail(
    ptr: *const u8,
    mut i: usize,
    len: usize,
    mut total_lines: u64,
    mut total_words: u64,
    mut prev_in_word: bool,
    data: &[u8],
) -> (u64, u64, bool, bool) {
    while i < len {
        // SAFETY: i < len here, and the caller guarantees ptr is valid for [0..len).
        let b = unsafe { *ptr.add(i) };
        if IS_SPACE[b as usize] {
            if b == b'\n' {
                total_lines += 1;
            }
            prev_in_word = false;
        } else if !prev_in_word {
            // Space-to-content transition starts a new word.
            total_words += 1;
            prev_in_word = true;
        }
        i += 1;
    }
    let first_word = first_is_word(data);
    (total_lines, total_words, first_word, prev_in_word)
}
216
/// AVX2-accelerated fused line+word counter for C locale chunks.
/// Processes 32 bytes per iteration using 2-state logic matching GNU wc 9.7:
///   - Space: {0x09-0x0D, 0x20, 0xA0} (7 values, same set as `IS_SPACE`) — ends word
///   - Non-space: everything else — starts/continues word
/// Word transitions detected via bitmask: space-to-nonspace transitions.
/// Returns (line_count, word_count, first_is_word_content, ends_in_word).
///
/// # Safety
/// The running CPU must support AVX2 (verified at runtime by
/// `count_lw_c_chunk_fast` before calling this).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    let len = data.len();
    let ptr = data.as_ptr();
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    unsafe {
        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
        let zero = _mm256_setzero_si256();
        let ones = _mm256_set1_epi8(1);
        // Space detection: {0x09-0x0D, 0x20, 0xA0}
        let const_0x09 = _mm256_set1_epi8(0x09u8 as i8);
        let const_0x0d = _mm256_set1_epi8(0x0Du8 as i8);
        let const_0x20 = _mm256_set1_epi8(0x20u8 as i8);
        let const_0xa0 = _mm256_set1_epi8(0xA0u8 as i8);

        // Per-lane u8 newline accumulator; flushed via PSADBW every 255
        // iterations, before any lane (max +1 per iteration) can overflow.
        let mut line_acc = _mm256_setzero_si256();
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
            line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));

            // Space check: byte in {0x09-0x0D, 0x20, 0xA0}
            // (unsigned range test for 0x09..=0x0D via max/min + equality)
            let ge_09 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x09), v);
            let le_0d = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x0d), v);
            let in_tab_range = _mm256_and_si256(ge_09, le_0d);
            let is_sp = _mm256_cmpeq_epi8(v, const_0x20);
            let is_nbsp = _mm256_cmpeq_epi8(v, const_0xa0);
            let is_space = _mm256_or_si256(_mm256_or_si256(in_tab_range, is_sp), is_nbsp);
            let space_mask = _mm256_movemask_epi8(is_space) as u32;

            // 2-state: non-space = word content, space = break
            // Word starts = positions where byte is non-space AND previous byte was space
            let nonspace_mask = !space_mask;
            // Build "previous was space" mask: shift space_mask left by 1, inject prev state
            // (bit 0's "previous byte" is the last byte of the prior iteration).
            let prev_space = (space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 };
            let starts = nonspace_mask & prev_space;
            total_words += starts.count_ones() as u64;

            // Update prev_in_word: last byte of this chunk is non-space?
            prev_in_word = (nonspace_mask >> 31) & 1 == 1;

            batch += 1;
            if batch >= 255 {
                // Horizontal sum of u8 lanes via PSADBW (sum of absolute
                // differences against zero), folded down to one u64.
                let sad = _mm256_sad_epu8(line_acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let s = _mm_add_epi64(lo, hi);
                let h64 = _mm_unpackhi_epi64(s, s);
                let t = _mm_add_epi64(s, h64);
                total_lines += _mm_cvtsi128_si64(t) as u64;
                line_acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Flush any newline counts still held in the vector accumulator.
        if batch > 0 {
            let sad = _mm256_sad_epu8(line_acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let s = _mm_add_epi64(lo, hi);
            let h64 = _mm_unpackhi_epi64(s, s);
            let t = _mm_add_epi64(s, h64);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }
    }

    // Process the final < 32 bytes and compute the leading-byte flag.
    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
}
300
/// SSE2 variant of count_lw_c_chunk_avx2 — processes 16 bytes per iteration.
/// See AVX2 function above for algorithm details. SSE2 is part of the
/// x86_64 baseline, so callers need no runtime feature check for this path.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    let len = data.len();
    let ptr = data.as_ptr();
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    unsafe {
        let nl_byte = _mm_set1_epi8(b'\n' as i8);
        let zero = _mm_setzero_si128();
        let ones = _mm_set1_epi8(1);
        // Space detection: {0x09-0x0D, 0x20, 0xA0}
        let const_0x09 = _mm_set1_epi8(0x09u8 as i8);
        let const_0x0d = _mm_set1_epi8(0x0Du8 as i8);
        let const_0x20 = _mm_set1_epi8(0x20u8 as i8);
        let const_0xa0 = _mm_set1_epi8(0xA0u8 as i8);

        // Per-lane u8 newline accumulator, flushed every 255 iterations
        // before it can overflow.
        let mut line_acc = _mm_setzero_si128();
        let mut batch = 0u32;

        while i + 16 <= len {
            let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);
            let is_nl = _mm_cmpeq_epi8(v, nl_byte);
            line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));

            // Space check: byte in {0x09-0x0D, 0x20, 0xA0}
            let ge_09 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x09), v);
            let le_0d = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x0d), v);
            let in_tab_range = _mm_and_si128(ge_09, le_0d);
            let is_sp = _mm_cmpeq_epi8(v, const_0x20);
            let is_nbsp = _mm_cmpeq_epi8(v, const_0xa0);
            let is_space = _mm_or_si128(_mm_or_si128(in_tab_range, is_sp), is_nbsp);
            let space_mask = (_mm_movemask_epi8(is_space) as u32) & 0xFFFF;

            // 2-state word start detection (bit 0's "previous byte" is the
            // final byte of the prior iteration, injected from prev_in_word).
            let nonspace_mask = !space_mask & 0xFFFF;
            let prev_space = ((space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 }) & 0xFFFF;
            let starts = nonspace_mask & prev_space;
            total_words += starts.count_ones() as u64;

            prev_in_word = (nonspace_mask >> 15) & 1 == 1;

            batch += 1;
            if batch >= 255 {
                // Horizontal sum of u8 lanes via PSADBW before overflow.
                let sad = _mm_sad_epu8(line_acc, zero);
                let hi = _mm_unpackhi_epi64(sad, sad);
                let t = _mm_add_epi64(sad, hi);
                total_lines += _mm_cvtsi128_si64(t) as u64;
                line_acc = _mm_setzero_si128();
                batch = 0;
            }
            i += 16;
        }

        // Flush any remaining newline counts from the vector accumulator.
        if batch > 0 {
            let sad = _mm_sad_epu8(line_acc, zero);
            let hi = _mm_unpackhi_epi64(sad, sad);
            let t = _mm_add_epi64(sad, hi);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }
    }

    // Process the final < 16 bytes and compute the leading-byte flag.
    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
}
372
/// Dispatch to AVX2, SSE2, or scalar chunk counter.
/// Small inputs skip SIMD (the scalar tail would do all the work anyway).
#[inline]
fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
            // SAFETY: AVX2 availability verified by the runtime check above.
            return unsafe { count_lw_c_chunk_avx2(data) };
        }
        if data.len() >= 32 {
            // SAFETY: SSE2 is always available on x86_64.
            return unsafe { count_lw_c_chunk_sse2(data) };
        }
    }
    count_lw_c_chunk(data)
}
387
388/// Count words + lines in a C locale chunk using 2-state logic, returning
389/// counts plus boundary info for parallel chunk merging.
390/// Returns (line_count, word_count, first_is_word_content, ends_in_word).
391fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
392    let mut lines = 0u64;
393    let mut words = 0u64;
394    let mut in_word = false;
395    let mut i = 0;
396    let len = data.len();
397
398    let first_word = first_is_word(data);
399
400    while i < len {
401        let b = unsafe { *data.get_unchecked(i) };
402        if IS_SPACE[b as usize] {
403            if b == b'\n' {
404                lines += 1;
405            }
406            in_word = false;
407        } else if !in_word {
408            in_word = true;
409            words += 1;
410        }
411        i += 1;
412    }
413    (lines, words, first_word, in_word)
414}
415
/// Count words in UTF-8 locale using 3-state logic matching GNU wc 9.4.
///
/// Handles:
/// - ASCII spaces (0x09-0x0D, 0x20): word break
/// - ASCII printable (0x21-0x7E): word content
/// - ASCII non-printable, non-space (NUL, controls, DEL): transparent (no state change)
/// - Valid UTF-8 multi-byte Unicode spaces (iswspace): word break
/// - Non-breaking spaces (U+00A0, U+2007, U+202F, U+2060): word break (iswnbspace)
/// - Valid UTF-8 printable non-space chars: word content
/// - Non-printable Unicode (C1 controls, etc.): transparent
/// - Invalid UTF-8 encoding errors: transparent (matches GNU wc 9.4 EILSEQ handling)
///
/// "Transparent" means the byte neither starts nor ends a word: `in_word`
/// is left unchanged. Decoding is done inline by leading-byte range; each
/// branch verifies its continuation bytes before consuming them, and an
/// incomplete sequence consumes only the leading byte.
fn count_words_utf8(data: &[u8]) -> u64 {
    let mut words = 0u64;
    let mut in_word = false;
    let mut i = 0;
    let len = data.len();

    while i < len {
        // SAFETY: loop condition guarantees i < len; the multi-byte branches
        // below check i + k < len before reading data[i + k].
        let b = unsafe { *data.get_unchecked(i) };

        if b < 0x80 {
            // ASCII byte — 3-state matching GNU wc 9.4:
            // Space (0x09-0x0D, 0x20): word break
            // Printable non-space (0x21-0x7E): word content
            // Non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
            if IS_SPACE[b as usize] {
                in_word = false;
            } else if b >= 0x21 && b <= 0x7E {
                // Printable ASCII: word content
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // else: non-printable, non-space → transparent (no state change)
            i += 1;
        } else if b < 0xC2 {
            // Invalid UTF-8: bare continuation byte (0x80-0xBF) or overlong (0xC0-0xC1)
            // Encoding error → transparent (matches GNU wc 9.4 EILSEQ handling)
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence (lead 0xC2-0xDF).
            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6)
                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if is_printable_unicode(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // else: non-printable, non-space → transparent
                i += 2;
            } else {
                // Incomplete sequence → transparent (encoding error)
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence (lead 0xE0-0xEF). Surrogate codepoints decode
            // here but are rejected by is_printable_unicode → transparent.
            if i + 2 < len
                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if is_printable_unicode(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // else: non-printable, non-space → transparent
                i += 3;
            } else {
                // Incomplete sequence → transparent (encoding error)
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence (lead 0xF0-0xF4). Codepoints above U+10FFFF
            // are rejected by is_printable_unicode → transparent.
            if i + 3 < len
                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if is_printable_unicode(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // else: non-printable, non-space → transparent
                i += 4;
            } else {
                // Incomplete sequence → transparent (encoding error)
                i += 1;
            }
        } else {
            // Invalid byte >= 0xF5 → transparent (encoding error)
            i += 1;
        }
    }

    words
}
528
529/// Count lines and words using optimized strategies per locale.
530/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
531/// C locale: AVX2 SIMD fused counter when available, scalar fallback otherwise.
532pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
533    if utf8 {
534        count_lines_words_utf8_fused(data)
535    } else {
536        let (lines, words, _, _) = count_lw_c_chunk_fast(data);
537        (lines, words)
538    }
539}
540
/// Fused lines+words counting in UTF-8 mode (single pass).
/// Avoids separate memchr pass for newlines by counting them inline with words.
/// Uses 3-state logic matching GNU wc 9.4:
///   - Encoding errors are transparent (no state change, matching GNU wc EILSEQ)
///   - ASCII non-printable, non-space bytes (NUL, controls) are transparent
///   - Printable non-space chars are word content
///   - Whitespace chars are word breaks
///
/// The decoding structure mirrors `count_words_utf8`, with an extra fast
/// branch for `\n` that bumps the line count before the space handling.
fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
    let mut lines = 0u64;
    let mut words = 0u64;
    let mut in_word = false;
    let mut i = 0;
    let len = data.len();

    while i < len {
        // SAFETY: loop condition guarantees i < len; multi-byte branches
        // check i + k < len before reading data[i + k].
        let b = unsafe { *data.get_unchecked(i) };

        if b == b'\n' {
            // Newline: counts as a line and is also a word break.
            lines += 1;
            in_word = false;
            i += 1;
        } else if b < 0x80 {
            // ASCII byte — 3-state matching GNU wc 9.4:
            // Space: word break. Printable (0x21-0x7E): word content.
            // Non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent.
            if IS_SPACE[b as usize] {
                in_word = false;
            } else if b >= 0x21 && b <= 0x7E {
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // else: transparent
            i += 1;
        } else if b < 0xC2 {
            // Invalid UTF-8 → transparent (encoding error)
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence (lead 0xC2-0xDF).
            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6)
                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if is_printable_unicode(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 2;
            } else {
                // Incomplete → transparent
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence (lead 0xE0-0xEF).
            if i + 2 < len
                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if is_printable_unicode(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 3;
            } else {
                // Incomplete → transparent
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence (lead 0xF0-0xF4).
            if i + 3 < len
                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if is_printable_unicode(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 4;
            } else {
                // Incomplete → transparent
                i += 1;
            }
        } else {
            // Invalid byte >= 0xF5 → transparent
            i += 1;
        }
    }

    (lines, words)
}
648
649/// Count lines, words, and chars using optimized strategies per locale.
650pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
651    if utf8 {
652        // Fused single-pass for lines+words, then fast char-counting pass
653        let (lines, words) = count_lines_words_utf8_fused(data);
654        let chars = count_chars_utf8(data);
655        (lines, words, chars)
656    } else {
657        // C locale: use optimized fused lines+words, chars = byte count
658        let (lines, words) = count_lines_words(data, false);
659        (lines, words, data.len() as u64)
660    }
661}
662
/// Count UTF-8 characters by counting non-continuation bytes.
/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
///
/// Uses AVX2 SIMD on x86_64 for ~32 bytes per cycle throughput.
/// Falls back to 64-byte block processing with popcount on other architectures.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 availability verified by the runtime check above.
            return unsafe { count_chars_utf8_avx2(data) };
        }
    }
    count_chars_utf8_scalar(data)
}
678
/// AVX2 SIMD character counter: counts non-continuation bytes using
/// vectorized AND+CMP with batched horizontal reduction via PSADBW.
/// Processes 32 bytes per ~3 instructions, with horizontal sum every 255 iterations
/// (the per-lane u8 accumulator gains at most 1 per iteration, so flushing
/// at 255 prevents overflow).
///
/// # Safety
/// The running CPU must support AVX2 (verified by `count_chars_utf8`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    unsafe {
        use std::arch::x86_64::*;

        let mask_c0 = _mm256_set1_epi8(0xC0u8 as i8);
        let val_80 = _mm256_set1_epi8(0x80u8 as i8);
        let ones = _mm256_set1_epi8(1);
        let zero = _mm256_setzero_si256();

        let mut total = 0u64;
        let len = data.len();
        let ptr = data.as_ptr();
        let mut i = 0;
        let mut acc = _mm256_setzero_si256();
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            // (b & 0xC0) == 0x80 identifies continuation bytes; count the rest.
            let masked = _mm256_and_si256(v, mask_c0);
            let is_cont = _mm256_cmpeq_epi8(masked, val_80);
            let non_cont = _mm256_andnot_si256(is_cont, ones);
            acc = _mm256_add_epi8(acc, non_cont);

            batch += 1;
            if batch >= 255 {
                // Horizontal sum via PSADBW: sum u8 differences against zero
                let sad = _mm256_sad_epu8(acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let sum = _mm_add_epi64(lo, hi);
                let hi64 = _mm_unpackhi_epi64(sum, sum);
                let t = _mm_add_epi64(sum, hi64);
                total += _mm_cvtsi128_si64(t) as u64;
                acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Final horizontal sum
        if batch > 0 {
            let sad = _mm256_sad_epu8(acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let sum = _mm_add_epi64(lo, hi);
            let hi64 = _mm_unpackhi_epi64(sum, sum);
            let t = _mm_add_epi64(sum, hi64);
            total += _mm_cvtsi128_si64(t) as u64;
        }

        // Scalar tail for the final < 32 bytes.
        while i < len {
            total += ((*ptr.add(i) & 0xC0) != 0x80) as u64;
            i += 1;
        }

        total
    }
}
742
/// Scalar fallback for count_chars_utf8: counts every byte that is NOT a
/// UTF-8 continuation byte (`10xxxxxx`), i.e. the bytes that begin a
/// character. Processes 64-byte blocks with an all-ASCII fast path.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    let mut total = 0u64;
    let mut blocks = data.chunks_exact(64);

    for block in blocks.by_ref() {
        // OR every byte together: if bit 7 never appears, the whole block is
        // pure ASCII and all 64 bytes start a character.
        let folded = block.iter().fold(0u8, |acc, &b| acc | b);
        if folded < 0x80 {
            total += 64;
        } else {
            total += block.iter().filter(|&&b| (b & 0xC0) != 0x80).count() as u64;
        }
    }

    // Remaining < 64 bytes.
    total
        + blocks
            .remainder()
            .iter()
            .filter(|&&b| (b & 0xC0) != 0x80)
            .count() as u64
}
794
/// Count characters in the C/POSIX locale.
///
/// In a single-byte locale every byte is exactly one character, so the
/// character count equals the byte count.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    data.len() as u64
}
800
801/// Count characters, choosing behavior based on locale.
802#[inline]
803pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
804    if utf8 {
805        count_chars_utf8(data)
806    } else {
807        count_chars_c(data)
808    }
809}
810
/// Detect whether the current locale uses UTF-8 encoding.
///
/// Inspects `LC_ALL`, `LC_CTYPE`, and `LANG` in POSIX precedence order; the
/// first variable that is set to a non-empty value decides. The value is
/// matched case-insensitively against "utf-8"/"utf8". If none of the
/// variables is set (or all are empty), assumes a non-UTF-8 locale.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|val| !val.is_empty())
        .map_or(false, |val| {
            let folded = val.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
}
823
/// Decode one UTF-8 character from the front of a byte slice.
///
/// Returns `(codepoint, byte_length)`. On an invalid or truncated sequence
/// the leading byte is returned as-is with length 1. Lead bytes 0x80..=0xC1
/// (continuations and overlong 2-byte starts) and 0xF5..=0xFF are rejected
/// outright. Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let b0 = bytes[0];
    // Map the lead byte to (continuation count, initial codepoint bits).
    let (need, init) = match b0 {
        0x00..=0x7F => return (b0 as u32, 1), // plain ASCII
        0xC2..=0xDF => (1, (b0 & 0x1F) as u32),
        0xE0..=0xEF => (2, (b0 & 0x0F) as u32),
        0xF0..=0xF4 => (3, (b0 & 0x07) as u32),
        // 0x80..=0xC1 (continuation / overlong start) and 0xF5..=0xFF: invalid
        _ => return (b0 as u32, 1),
    };
    // Not enough bytes left for the full sequence.
    if bytes.len() <= need {
        return (b0 as u32, 1);
    }
    let mut cp = init;
    for &b in &bytes[1..=need] {
        // Every trailing byte must be a 0b10xxxxxx continuation.
        if b & 0xC0 != 0x80 {
            return (b0 as u32, 1);
        }
        cp = (cp << 6) | (b & 0x3F) as u32;
    }
    (cp, need + 1)
}
867
/// Check if a Unicode codepoint is a zero-width character (combining mark,
/// format control, zero-width joiner, etc.).
///
/// GNU wc computes `-L` display widths with wcwidth(), which returns 0 for
/// these codepoints; this table must match so `-L` output agrees.
///
/// NOTE(review): this is a hand-maintained approximation of glibc's
/// wcwidth() zero-width set — verify against the target glibc/Unicode
/// version when updating ranges.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    matches!(
        cp,
        0x0300..=0x036F   // Combining Diacritical Marks
        | 0x0483..=0x0489 // Cyrillic combining marks
        | 0x0591..=0x05BD // Hebrew combining marks
        | 0x05BF
        | 0x05C1..=0x05C2
        | 0x05C4..=0x05C5
        | 0x05C7
        | 0x0600..=0x0605 // Arabic number signs
        | 0x0610..=0x061A // Arabic combining marks
        | 0x064B..=0x065F // Arabic combining marks
        | 0x0670
        | 0x06D6..=0x06DD
        | 0x06DF..=0x06E4
        | 0x06E7..=0x06E8
        | 0x06EA..=0x06ED
        | 0x070F          // Syriac abbreviation mark (format char)
        | 0x0711          // Syriac superscript alaph
        | 0x0730..=0x074A // Syriac combining points
        | 0x07A6..=0x07B0
        | 0x07EB..=0x07F3
        | 0x07FD
        | 0x0816..=0x0819
        | 0x081B..=0x0823
        | 0x0825..=0x0827
        | 0x0829..=0x082D
        | 0x0859..=0x085B
        | 0x08D3..=0x08E1
        | 0x08E3..=0x0902
        // Indic scripts (Devanagari..Sinhala): nuktas, viramas, vowel signs
        | 0x093A
        | 0x093C
        | 0x0941..=0x0948
        | 0x094D
        | 0x0951..=0x0957
        | 0x0962..=0x0963
        | 0x0981
        | 0x09BC
        | 0x09C1..=0x09C4
        | 0x09CD
        | 0x09E2..=0x09E3
        | 0x09FE
        | 0x0A01..=0x0A02
        | 0x0A3C
        | 0x0A41..=0x0A42
        | 0x0A47..=0x0A48
        | 0x0A4B..=0x0A4D
        | 0x0A51
        | 0x0A70..=0x0A71
        | 0x0A75
        | 0x0A81..=0x0A82
        | 0x0ABC
        | 0x0AC1..=0x0AC5
        | 0x0AC7..=0x0AC8
        | 0x0ACD
        | 0x0AE2..=0x0AE3
        | 0x0AFA..=0x0AFF
        | 0x0B01
        | 0x0B3C
        | 0x0B3F
        | 0x0B41..=0x0B44
        | 0x0B4D
        | 0x0B56
        | 0x0B62..=0x0B63
        | 0x0B82
        | 0x0BC0
        | 0x0BCD
        | 0x0C00
        | 0x0C04
        | 0x0C3E..=0x0C40
        | 0x0C46..=0x0C48
        | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56
        | 0x0C62..=0x0C63
        | 0x0C81
        | 0x0CBC
        | 0x0CBF
        | 0x0CC6
        | 0x0CCC..=0x0CCD
        | 0x0CE2..=0x0CE3
        | 0x0D00..=0x0D01
        | 0x0D3B..=0x0D3C
        | 0x0D41..=0x0D44
        | 0x0D4D
        | 0x0D62..=0x0D63
        | 0x0DCA
        | 0x0DD2..=0x0DD4
        | 0x0DD6
        // Thai, Lao, Tibetan combining vowels/tones
        | 0x0E31
        | 0x0E34..=0x0E3A
        | 0x0E47..=0x0E4E
        | 0x0EB1
        | 0x0EB4..=0x0EBC
        | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19
        | 0x0F35
        | 0x0F37
        | 0x0F39
        | 0x0F71..=0x0F7E
        | 0x0F80..=0x0F84
        | 0x0F86..=0x0F87
        | 0x0F8D..=0x0F97
        | 0x0F99..=0x0FBC
        | 0x0FC6
        | 0x102D..=0x1030
        | 0x1032..=0x1037
        | 0x1039..=0x103A
        | 0x103D..=0x103E
        | 0x1058..=0x1059
        | 0x105E..=0x1060
        | 0x1071..=0x1074
        | 0x1082
        | 0x1085..=0x1086
        | 0x108D
        | 0x109D
        | 0x1160..=0x11FF // Hangul Jamo medial vowels and final consonants
        | 0x135D..=0x135F
        | 0x1712..=0x1714
        | 0x1732..=0x1734
        | 0x1752..=0x1753
        | 0x1772..=0x1773
        | 0x17B4..=0x17B5
        | 0x17B7..=0x17BD
        | 0x17C6
        | 0x17C9..=0x17D3
        | 0x17DD
        | 0x180B..=0x180D // Mongolian free variation selectors
        | 0x1885..=0x1886
        | 0x18A9
        | 0x1920..=0x1922
        | 0x1927..=0x1928
        | 0x1932
        | 0x1939..=0x193B
        | 0x1A17..=0x1A18
        | 0x1A1B
        | 0x1A56
        | 0x1A58..=0x1A5E
        | 0x1A60
        | 0x1A62
        | 0x1A65..=0x1A6C
        | 0x1A73..=0x1A7C
        | 0x1A7F
        | 0x1AB0..=0x1ABE // Combining Diacritical Marks Extended
        | 0x1B00..=0x1B03
        | 0x1B34
        | 0x1B36..=0x1B3A
        | 0x1B3C
        | 0x1B42
        | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81
        | 0x1BA2..=0x1BA5
        | 0x1BA8..=0x1BA9
        | 0x1BAB..=0x1BAD
        | 0x1BE6
        | 0x1BE8..=0x1BE9
        | 0x1BED
        | 0x1BEF..=0x1BF1
        | 0x1C2C..=0x1C33
        | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2
        | 0x1CD4..=0x1CE0
        | 0x1CE2..=0x1CE8
        | 0x1CED
        | 0x1CF4
        | 0x1CF8..=0x1CF9
        | 0x1DC0..=0x1DF9 // Combining Diacritical Marks Supplement
        | 0x1DFB..=0x1DFF
        | 0x200B..=0x200F // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        | 0x202A..=0x202E // Bidi control chars
        | 0x2060..=0x2064 // Word joiner, invisible operators
        | 0x2066..=0x206F // Bidi isolates
        | 0x20D0..=0x20F0 // Combining marks for symbols
        | 0xFE00..=0xFE0F // Variation Selectors
        | 0xFE20..=0xFE2F // Combining Half Marks
        | 0xFEFF          // Zero Width No-Break Space (BOM)
        | 0xFFF9..=0xFFFB // Interlinear annotation anchors
        | 0x1D167..=0x1D169 // Musical symbol combining stems/tremolos
        | 0x1D173..=0x1D182 // Musical symbol format controls + combining marks
        | 0x1D185..=0x1D18B
        | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001           // Language tag (deprecated format char)
        | 0xE0020..=0xE007F // Tag characters
        | 0xE0100..=0xE01EF // Variation Selectors Supplement
    )
}
1058
/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
/// Matches glibc wcwidth() behavior for maximum GNU compatibility.
///
/// Used by the `-L` (max line length) computation: codepoints in this set
/// advance the column by 2 instead of 1.
///
/// NOTE(review): hand-maintained range list (CJK blocks plus emoji-style
/// symbols); verify against the target glibc/Unicode version when updating.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F   // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        // Misc symbols commonly rendered emoji-style (width 2)
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3040..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3  // Hangul Syllables
        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19  // Vertical Forms
        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
        | 0xFF01..=0xFF60  // Fullwidth Latin, Halfwidth Katakana
        | 0xFFE0..=0xFFE6  // Fullwidth Signs
        | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
1145
/// Compute maximum display width of any line (C/POSIX locale).
///
/// GNU wc -L behavior in C locale:
/// - `\n`: line terminator (records max, resets position)
/// - `\t`: advances to the next tab stop (multiple of 8)
/// - `\r`: carriage return (resets position to 0, same line)
/// - `\f`: form feed (acts as line terminator like `\n`)
/// - Printable ASCII (0x20..0x7E): width 1
/// - Everything else (controls, high bytes): width 0
///
/// Runs of printable non-space ASCII (0x21..0x7E) are length-counted in one
/// go rather than advancing the column byte by byte.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut best: u64 = 0; // widest line completed so far
    let mut cur: u64 = 0;  // widest column reached on the current line
    let mut pos: u64 = 0;  // current column (can shrink via \r)
    let mut idx = 0;
    let n = data.len();

    while idx < n {
        let byte = data[idx];

        if (0x21..=0x7E).contains(&byte) {
            // Printable non-space ASCII: advance by the whole run at once.
            let run_start = idx;
            idx += 1;
            while idx < n && (0x21..=0x7E).contains(&data[idx]) {
                idx += 1;
            }
            pos += (idx - run_start) as u64;
            cur = cur.max(pos);
            continue;
        }

        match byte {
            b' ' => {
                pos += 1;
                cur = cur.max(pos);
            }
            // \n and \f both terminate the line.
            b'\n' | 0x0C => {
                best = best.max(cur);
                pos = 0;
                cur = 0;
            }
            b'\t' => {
                // Round the column up to the next multiple of 8.
                pos = (pos + 8) & !7;
                cur = cur.max(pos);
            }
            b'\r' => {
                // Return to column 0 without ending the line; `cur` keeps
                // the widest column already reached.
                pos = 0;
            }
            _ => {} // Non-printable: width 0
        }
        idx += 1;
    }

    // Account for a final line with no trailing terminator.
    best.max(cur)
}
1227
/// Compute maximum display width of any line (UTF-8 locale).
///
/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
///
/// Line-control semantics mirror the C-locale version: `\n` and `\f`
/// terminate a line (fold the line's width into the max), `\t` advances to
/// the next tab stop (multiple of 8), `\r` resets the column without ending
/// the line, and other ASCII controls have width 0.
///
/// Optimized with printable ASCII run counting for common text.
pub fn max_line_length_utf8(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0;  // widest line completed so far
    let mut line_len: u64 = 0; // widest column reached on the current line
    let mut linepos: u64 = 0;  // current column (can shrink via \r)
    let mut i = 0;
    let len = data.len();

    while i < len {
        // SAFETY: i < len by the loop condition.
        let b = unsafe { *data.get_unchecked(i) };

        if b >= 0x21 && b <= 0x7E {
            // Printable non-space ASCII (most common) — count run length
            i += 1;
            let mut run = 1u64;
            while i < len {
                // SAFETY: i < len by the inner loop condition.
                let b = unsafe { *data.get_unchecked(i) };
                if b >= 0x21 && b <= 0x7E {
                    run += 1;
                    i += 1;
                } else {
                    break;
                }
            }
            linepos += run;
            if linepos > line_len {
                line_len = linepos;
            }
        } else if b < 0x80 {
            // Other ASCII: space, tab, newline, controls
            match b {
                b' ' => {
                    linepos += 1;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                b'\n' => {
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                b'\t' => {
                    // Round the column up to the next multiple of 8.
                    linepos = (linepos + 8) & !7;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                b'\r' => {
                    // Return to column 0; line_len keeps the widest column seen.
                    linepos = 0;
                }
                0x0C => {
                    // Form feed: acts as a line terminator like \n.
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                _ => {} // Non-printable: width 0
            }
            i += 1;
        } else {
            // Multibyte UTF-8
            let (cp, blen) = decode_utf8(&data[i..]);

            // C1 control characters (0x80..0x9F): non-printable, width 0
            // NOTE(review): decode_utf8 returns (byte, 1) for invalid
            // sequences, so stray invalid bytes 0xA0..=0xFF fall through to
            // the width-1 branch below. GNU wc skips mbrtowc encoding errors
            // without advancing the column — TODO confirm and align.
            if cp <= 0x9F {
                // width 0
            } else if is_zero_width(cp) {
                // Combining marks, zero-width chars: width 0
            } else if is_wide_char(cp) {
                linepos += 2;
                if linepos > line_len {
                    line_len = linepos;
                }
            } else {
                // Regular printable Unicode character: width 1
                linepos += 1;
                if linepos > line_len {
                    line_len = linepos;
                }
            }
            i += blen;
        }
    }

    // Handle last line
    if line_len > max_len {
        max_len = line_len;
    }

    max_len
}
1328
1329/// Compute maximum display width, choosing behavior based on locale.
1330#[inline]
1331pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1332    if utf8 {
1333        max_line_length_utf8(data)
1334    } else {
1335        max_line_length_c(data)
1336    }
1337}
1338
1339/// Count all metrics using optimized individual passes.
1340///
1341/// Each metric uses its own optimized algorithm:
1342/// - Lines: SIMD-accelerated memchr
1343/// - Words: 2-state scalar/state-machine (locale-dependent)
1344/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1345/// - Max line length: locale-aware display width tracking
1346///
1347/// Multi-pass is faster than single-pass because each pass has a tight,
1348/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1349/// making subsequent passes nearly free for memory bandwidth.
1350pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1351    if utf8 {
1352        let (lines, words) = count_lines_words_utf8_fused(data);
1353        WcCounts {
1354            lines,
1355            words,
1356            bytes: data.len() as u64,
1357            chars: count_chars_utf8(data),
1358            max_line_length: max_line_length_utf8(data),
1359        }
1360    } else {
1361        WcCounts {
1362            lines: count_lines(data),
1363            words: count_words_locale(data, false),
1364            bytes: data.len() as u64,
1365            chars: data.len() as u64,
1366            max_line_length: max_line_length_c(data),
1367        }
1368    }
1369}
1370
/// Heuristically decide whether data is likely all-ASCII by sampling three
/// regions: the first, middle, and last (up to) 256 bytes.
///
/// Returns false as soon as any sampled byte is >= 0x80. Note this is a
/// sample, not a full scan: non-ASCII bytes outside the three windows are
/// not detected. Empty input counts as ASCII.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    let len = data.len();
    if len == 0 {
        return true;
    }

    let sample = len.min(256);
    // `all` over a contiguous slice compiles to a tight, vectorizable scan.
    let region_is_ascii = |start: usize, end: usize| data[start..end].iter().all(|&b| b < 0x80);

    // Beginning of the buffer.
    if !region_is_ascii(0, sample) {
        return false;
    }
    // Middle window, only when it cannot overlap the ends.
    if len > sample * 2 {
        let mid_start = (len / 2).saturating_sub(sample / 2);
        if !region_is_ascii(mid_start, (mid_start + sample).min(len)) {
            return false;
        }
    }
    // Tail window (skipped when it would duplicate the head).
    len <= sample || region_is_ascii(len - sample, len)
}
1429
1430// ──────────────────────────────────────────────────
1431// Parallel counting for large files
1432// ──────────────────────────────────────────────────
1433
1434/// Split data into chunks at newline boundaries for parallel processing.
1435/// Returns slices where each slice (except possibly the last) ends with `\n`.
1436/// Splitting at newlines guarantees word boundaries in any locale,
1437/// enabling safe parallel word counting without boundary adjustment.
1438fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1439    if data.is_empty() || num_chunks <= 1 {
1440        return vec![data];
1441    }
1442    let chunk_size = data.len() / num_chunks;
1443    let mut chunks = Vec::with_capacity(num_chunks);
1444    let mut pos = 0;
1445
1446    for _ in 0..num_chunks - 1 {
1447        let target = pos + chunk_size;
1448        if target >= data.len() {
1449            break;
1450        }
1451        let boundary = memchr::memchr(b'\n', &data[target..])
1452            .map(|p| target + p + 1)
1453            .unwrap_or(data.len());
1454        if boundary > pos {
1455            chunks.push(&data[pos..boundary]);
1456        }
1457        pos = boundary;
1458    }
1459    if pos < data.len() {
1460        chunks.push(&data[pos..]);
1461    }
1462    chunks
1463}
1464
1465/// Count newlines in parallel using SIMD memchr + rayon.
1466/// Each thread gets at least 1MB (to amortize rayon scheduling overhead).
1467pub fn count_lines_parallel(data: &[u8]) -> u64 {
1468    if data.len() < PARALLEL_THRESHOLD {
1469        return count_lines(data);
1470    }
1471
1472    let num_threads = rayon::current_num_threads().max(1);
1473    // Ensure chunks are large enough to amortize SIMD setup overhead
1474    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1475
1476    data.par_chunks(chunk_size)
1477        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1478        .sum()
1479}
1480
1481/// Count words in parallel with boundary adjustment.
1482pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1483    if data.len() < PARALLEL_THRESHOLD {
1484        return count_words_locale(data, utf8);
1485    }
1486
1487    let num_threads = rayon::current_num_threads().max(1);
1488
1489    if utf8 {
1490        // UTF-8: split at newline boundaries for safe parallel word counting.
1491        // Newlines are always word boundaries, so no boundary adjustment needed.
1492        let chunks = split_at_newlines(data, num_threads);
1493        chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1494    } else {
1495        // C locale: parallel 2-state word counting with boundary adjustment
1496        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1497
1498        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1499
1500        // Each chunk returns (lines, word_count, first_is_word, ends_in_word)
1501        let results: Vec<(u64, u64, bool, bool)> = chunks
1502            .par_iter()
1503            .map(|chunk| count_lw_c_chunk(chunk))
1504            .collect();
1505
1506        let mut total = 0u64;
1507        for i in 0..results.len() {
1508            total += results[i].1;
1509            // Boundary adjustment: if previous chunk ended in_word AND
1510            // current chunk's first byte is non-space (word content),
1511            // the word was split across chunks — subtract the overcount.
1512            if i > 0 && results[i - 1].3 && results[i].2 {
1513                total -= 1;
1514            }
1515        }
1516        total
1517    }
1518}
1519
1520/// Count UTF-8 characters in parallel.
1521pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1522    if !utf8 {
1523        return data.len() as u64;
1524    }
1525    if data.len() < PARALLEL_THRESHOLD {
1526        return count_chars_utf8(data);
1527    }
1528
1529    let num_threads = rayon::current_num_threads().max(1);
1530    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1531
1532    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1533}
1534
1535/// Count lines + words + bytes in a single fused pass (the default wc mode).
1536/// Avoids separate passes entirely — combines newline counting with word detection.
1537pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1538    let (lines, words) = count_lines_words(data, utf8);
1539    (lines, words, data.len() as u64)
1540}
1541
1542/// Parallel counting of lines + words + bytes only (no chars).
1543/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1544/// C locale: single fused pass per chunk counts BOTH lines and words.
1545/// UTF-8: checks ASCII first for C locale fast path, else splits at newlines
1546/// for safe parallel UTF-8 word counting.
1547pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1548    if data.len() < PARALLEL_THRESHOLD {
1549        // Small file: use fused single-pass
1550        return count_lwb(data, utf8);
1551    }
1552
1553    let num_threads = rayon::current_num_threads().max(1);
1554
1555    let (lines, words) = if !utf8 {
1556        // C locale: FUSED parallel lines+words counting — single pass per chunk
1557        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1558
1559        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1560        let results: Vec<(u64, u64, bool, bool)> = chunks
1561            .par_iter()
1562            .map(|chunk| count_lw_c_chunk_fast(chunk))
1563            .collect();
1564
1565        let mut line_total = 0u64;
1566        let mut word_total = 0u64;
1567        for i in 0..results.len() {
1568            line_total += results[i].0;
1569            word_total += results[i].1;
1570            if i > 0 && results[i - 1].3 && results[i].2 {
1571                word_total -= 1;
1572            }
1573        }
1574
1575        (line_total, word_total)
1576    } else {
1577        // UTF-8 locale: check if ASCII for faster C locale path
1578        let is_ascii = check_ascii_sample(data);
1579        if is_ascii {
1580            // Pure ASCII: use C locale parallel path (arbitrary chunks OK)
1581            let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1582            let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1583            let results: Vec<(u64, u64, bool, bool)> = chunks
1584                .par_iter()
1585                .map(|chunk| count_lw_c_chunk_fast(chunk))
1586                .collect();
1587
1588            let mut line_total = 0u64;
1589            let mut word_total = 0u64;
1590            for i in 0..results.len() {
1591                line_total += results[i].0;
1592                word_total += results[i].1;
1593                if i > 0 && results[i - 1].3 && results[i].2 {
1594                    word_total -= 1;
1595                }
1596            }
1597            (line_total, word_total)
1598        } else {
1599            // Non-ASCII UTF-8: split at newline boundaries for safe parallel
1600            // word counting. Newlines always break words, so no adjustment needed.
1601            let chunks = split_at_newlines(data, num_threads);
1602            let results: Vec<(u64, u64)> = chunks
1603                .par_iter()
1604                .map(|chunk| count_lines_words_utf8_fused(chunk))
1605                .collect();
1606            let mut line_total = 0u64;
1607            let mut word_total = 0u64;
1608            for (l, w) in results {
1609                line_total += l;
1610                word_total += w;
1611            }
1612            (line_total, word_total)
1613        }
1614    };
1615
1616    (lines, words, data.len() as u64)
1617}
1618
1619/// Combined parallel counting of lines + words + chars.
1620/// UTF-8: splits at newline boundaries for fused lines+words+chars per chunk.
1621/// C locale: fused parallel lines+words with boundary adjustment + parallel chars.
1622pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1623    if data.len() < PARALLEL_THRESHOLD {
1624        let lines = count_lines(data);
1625        let words = count_words_locale(data, utf8);
1626        let chars = count_chars(data, utf8);
1627        return (lines, words, chars);
1628    }
1629
1630    let num_threads = rayon::current_num_threads().max(1);
1631
1632    if utf8 {
1633        // UTF-8: fused parallel lines+words+chars per chunk (split at newlines)
1634        let chunks = split_at_newlines(data, num_threads);
1635        let results: Vec<(u64, u64, u64)> = chunks
1636            .par_iter()
1637            .map(|chunk| {
1638                let (lines, words) = count_lines_words_utf8_fused(chunk);
1639                let chars = count_chars_utf8(chunk);
1640                (lines, words, chars)
1641            })
1642            .collect();
1643        let mut lines = 0u64;
1644        let mut words = 0u64;
1645        let mut chars = 0u64;
1646        for (l, w, c) in results {
1647            lines += l;
1648            words += w;
1649            chars += c;
1650        }
1651        (lines, words, chars)
1652    } else {
1653        // C locale: fused parallel lines+words + parallel chars (= byte count)
1654        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1655        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1656        let results: Vec<(u64, u64, bool, bool)> = chunks
1657            .par_iter()
1658            .map(|chunk| count_lw_c_chunk_fast(chunk))
1659            .collect();
1660        let mut lines = 0u64;
1661        let mut words = 0u64;
1662        for i in 0..results.len() {
1663            lines += results[i].0;
1664            words += results[i].1;
1665            if i > 0 && results[i - 1].3 && results[i].2 {
1666                words -= 1;
1667            }
1668        }
1669        (lines, words, data.len() as u64)
1670    }
1671}
1672
1673/// Parallel max line length computation.
1674/// Splits at newline boundaries so each chunk independently computes correct
1675/// max line width (since newlines reset position tracking).
1676pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1677    if data.len() < PARALLEL_THRESHOLD {
1678        return max_line_length(data, utf8);
1679    }
1680    let num_threads = rayon::current_num_threads().max(1);
1681    let chunks = split_at_newlines(data, num_threads);
1682    chunks
1683        .par_iter()
1684        .map(|chunk| {
1685            if utf8 {
1686                max_line_length_utf8(chunk)
1687            } else {
1688                max_line_length_c(chunk)
1689            }
1690        })
1691        .max()
1692        .unwrap_or(0)
1693}
1694
1695/// Parallel counting of all metrics at once.
1696/// Splits at newline boundaries for safe parallel word + max_line_length counting.
1697/// Each chunk computes all metrics in a single traversal group, maximizing cache reuse.
1698pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1699    if data.len() < PARALLEL_THRESHOLD {
1700        return count_all(data, utf8);
1701    }
1702
1703    let num_threads = rayon::current_num_threads().max(1);
1704    let chunks = split_at_newlines(data, num_threads);
1705
1706    if utf8 {
1707        let results: Vec<(u64, u64, u64, u64)> = chunks
1708            .par_iter()
1709            .map(|chunk| {
1710                let (lines, words) = count_lines_words_utf8_fused(chunk);
1711                let chars = count_chars_utf8(chunk);
1712                let max_ll = max_line_length_utf8(chunk);
1713                (lines, words, chars, max_ll)
1714            })
1715            .collect();
1716
1717        let mut counts = WcCounts {
1718            bytes: data.len() as u64,
1719            ..Default::default()
1720        };
1721        for (l, w, c, m) in results {
1722            counts.lines += l;
1723            counts.words += w;
1724            counts.chars += c;
1725            if m > counts.max_line_length {
1726                counts.max_line_length = m;
1727            }
1728        }
1729        counts
1730    } else {
1731        // C locale: fused lines+words per chunk + max_line_length per chunk
1732        let results: Vec<(u64, u64, u64)> = chunks
1733            .par_iter()
1734            .map(|chunk| {
1735                let (lines, words) = count_lines_words(chunk, false);
1736                let max_ll = max_line_length_c(chunk);
1737                (lines, words, max_ll)
1738            })
1739            .collect();
1740
1741        let mut counts = WcCounts {
1742            bytes: data.len() as u64,
1743            chars: data.len() as u64,
1744            ..Default::default()
1745        };
1746        for (l, w, m) in &results {
1747            counts.lines += l;
1748            counts.words += w;
1749            if *m > counts.max_line_length {
1750                counts.max_line_length = *m;
1751            }
1752        }
1753        counts
1754    }
1755}