Skip to main content

coreutils_rs/wc/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Smallest input size, in bytes, for which parallel processing is used (2 MiB).
///
/// Rationale recorded by the original author: Rayon costs roughly 5-10μs per
/// task, while scanning a 2MB chunk with SIMD memchr (~10 GB/s) takes about
/// 200μs, which keeps the scheduling overhead below 5%.
const PARALLEL_THRESHOLD: usize = 2 << 20;
8
/// Tallies gathered while counting a byte slice.
///
/// Every field starts at zero via `Default`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct WcCounts {
    /// Line count.
    pub lines: u64,
    /// Word count.
    pub words: u64,
    /// Byte count.
    pub bytes: u64,
    /// Character count.
    pub chars: u64,
    /// Length of the longest line.
    pub max_line_length: u64,
}
18
19// ──────────────────────────────────────────────────
20// 3-state byte classification for word counting
21// ──────────────────────────────────────────────────
22//
23// GNU wc uses mbrtowc() + iswspace() + iswprint() with 3-state logic:
24//   0 = printable (word content): starts or continues a word
25//   1 = space (word break): ends any current word
26//   2 = transparent (unchanged): non-printable, non-space — does NOT change in_word
27//
28// The critical difference from 2-state is that transparent characters
29// (NUL, control chars, invalid UTF-8) do NOT break words.
30// Example: "hello\x00world" is 1 word (NUL is transparent), not 2.
31
/// Builds the 3-state byte classification table for the C/POSIX locale.
///
/// Table values:
///   - `0` (word content): NUL (0x00) and printable ASCII (0x21-0x7E)
///   - `1` (space): `\t`, `\n`, `\v`, `\f`, `\r` (0x09-0x0D) and 0x20
///   - `2` (transparent): everything else, including all bytes >= 0x80
///
/// NUL is deliberately word content here: the original author notes that
/// mbrtowc() returns L'\0' for it and GNU wc lets it start/continue a word
/// in the C locale.
const fn make_byte_class_c() -> [u8; 256] {
    let mut table = [2u8; 256];
    let mut byte = 0usize;
    while byte < 256 {
        table[byte] = match byte {
            // Whitespace: ends the current word.
            0x09..=0x0D | 0x20 => 1,
            // NUL and printable ASCII: start or continue a word.
            0x00 | 0x21..=0x7E => 0,
            // Remaining controls, DEL and high bytes leave the state alone.
            _ => 2,
        };
        byte += 1;
    }
    table
}

/// Precomputed C-locale classification table, indexed by byte value.
const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
58
/// Builds the 3-state single-byte classification table for the UTF-8 locale.
///
/// Table values:
///   - `0` (word content): printable ASCII (0x21-0x7E)
///   - `1` (space): `\t`, `\n`, `\v`, `\f`, `\r` (0x09-0x0D) and 0x20
///   - `2` (transparent): everything else
///
/// Bytes >= 0x80 are all marked transparent here; multi-byte sequences are
/// decoded and classified by the word-counting state machines instead.
const fn make_byte_class_utf8() -> [u8; 256] {
    let mut table = [2u8; 256];
    let mut byte = 0usize;
    while byte < 256 {
        table[byte] = match byte {
            // Whitespace: ends the current word.
            0x09..=0x0D | 0x20 => 1,
            // Printable ASCII: starts or continues a word.
            0x21..=0x7E => 0,
            // Controls (including NUL), DEL and high bytes.
            _ => 2,
        };
        byte += 1;
    }
    table
}

/// Precomputed UTF-8-locale classification table, indexed by byte value.
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
80
81// ──────────────────────────────────────────────────
82// Unicode character classification helpers
83// ──────────────────────────────────────────────────
84
/// Reports whether `cp` is one of the non-ASCII Unicode whitespace code points
/// that end a word.
///
/// ASCII whitespace is not listed here; the byte tables cover it.
/// NOTE(review): the original comment says this list matches glibc
/// `iswspace`; that has not been re-verified here.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        // En Quad through Hair Space
        0x2000..=0x200A => true,
        // No-Break Space, Ogham Space Mark
        0x00A0 | 0x1680 => true,
        // Line Separator, Paragraph Separator, Narrow No-Break Space
        0x2028 | 0x2029 | 0x202F => true,
        // Medium Mathematical Space, Ideographic Space
        0x205F | 0x3000 => true,
        _ => false,
    }
}
102
/// Reports whether a decoded non-ASCII code point counts as printable.
///
/// Everything from U+00A0 upward is accepted; values below that (which for
/// callers passing `cp >= 0x80` means the C1 controls U+0080-U+009F) are not.
/// NOTE(review): the original comment describes this as an approximation of
/// glibc `iswprint`.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    matches!(cp, 0xA0..=u32::MAX)
}
110
111// ──────────────────────────────────────────────────
112// Core counting functions
113// ──────────────────────────────────────────────────
114
115/// Count newlines using SIMD-accelerated memchr.
116/// GNU wc counts newline bytes (`\n`), not logical lines.
117#[inline]
118pub fn count_lines(data: &[u8]) -> u64 {
119    memchr_iter(b'\n', data).count() as u64
120}
121
/// Returns the length of `data` in bytes; kept so all counters share one API shape.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let byte_len = data.len();
    byte_len as u64
}
127
128/// Count words using locale-aware 3-state logic (default: UTF-8).
129pub fn count_words(data: &[u8]) -> u64 {
130    count_words_locale(data, true)
131}
132
133/// Count words with explicit locale control using 3-state logic.
134///
135/// GNU wc classifies each character as:
136///   - space (iswspace=true): sets in_word=false
137///   - printable (iswprint=true): sets in_word=true, increments word count on transition
138///   - transparent (neither): leaves in_word unchanged
139pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
140    if utf8 {
141        count_words_utf8(data)
142    } else {
143        count_words_c(data)
144    }
145}
146
147/// Count words in C/POSIX locale using 3-state scalar logic.
148/// Only printable ASCII (0x21-0x7E) forms words.
149/// Bytes >= 0x80 and non-printable ASCII controls are transparent.
150///
151/// Optimized with ASCII run skipping for printable characters.
152fn count_words_c(data: &[u8]) -> u64 {
153    let mut words = 0u64;
154    let mut in_word = false;
155    let mut i = 0;
156    let len = data.len();
157
158    while i < len {
159        let b = unsafe { *data.get_unchecked(i) };
160        if b >= 0x21 && b <= 0x7E {
161            // Printable ASCII — word content
162            if !in_word {
163                in_word = true;
164                words += 1;
165            }
166            i += 1;
167            // Skip remaining printable ASCII
168            while i < len {
169                let b = unsafe { *data.get_unchecked(i) };
170                if b >= 0x21 && b <= 0x7E {
171                    i += 1;
172                } else {
173                    break;
174                }
175            }
176        } else {
177            let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
178            if class == 1 {
179                in_word = false;
180            } else if class == 0 {
181                // NUL is printable in C locale — starts/continues word
182                if !in_word {
183                    in_word = true;
184                    words += 1;
185                }
186            }
187            // class == 2: transparent — in_word unchanged
188            i += 1;
189        }
190    }
191    words
192}
193
194/// Count words + lines in a C locale chunk, returning counts plus boundary info.
195/// Used by parallel word counting.
196/// Returns (line_count, word_count, first_active_is_printable, ends_in_word).
197fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
198    let mut lines = 0u64;
199    let mut words = 0u64;
200    let mut in_word = false;
201    let mut first_active_is_printable = false;
202    let mut seen_active = false;
203    let mut i = 0;
204    let len = data.len();
205
206    while i < len {
207        let b = unsafe { *data.get_unchecked(i) };
208        if b >= 0x21 && b <= 0x7E {
209            // Printable ASCII
210            if !seen_active {
211                seen_active = true;
212                first_active_is_printable = true;
213            }
214            if !in_word {
215                in_word = true;
216                words += 1;
217            }
218            i += 1;
219            // Skip remaining printable ASCII
220            while i < len {
221                let b = unsafe { *data.get_unchecked(i) };
222                if b >= 0x21 && b <= 0x7E {
223                    i += 1;
224                } else {
225                    break;
226                }
227            }
228        } else if b == b'\n' {
229            lines += 1;
230            if !seen_active {
231                seen_active = true;
232            }
233            in_word = false;
234            i += 1;
235        } else {
236            let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
237            if class == 1 {
238                if !seen_active {
239                    seen_active = true;
240                }
241                in_word = false;
242            } else if class == 0 {
243                // NUL is printable in C locale — starts/continues word
244                if !seen_active {
245                    seen_active = true;
246                    first_active_is_printable = true;
247                }
248                if !in_word {
249                    in_word = true;
250                    words += 1;
251                }
252            }
253            i += 1;
254        }
255    }
256    (lines, words, first_active_is_printable, in_word)
257}
258
259/// Count words in UTF-8 locale using a state machine with 3-state logic.
260///
261/// Handles:
262/// - ASCII spaces (0x09-0x0D, 0x20): word break
263/// - ASCII printable (0x21-0x7E): word content
264/// - ASCII non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
265/// - Valid UTF-8 multi-byte → check Unicode space/printable
266/// - Invalid UTF-8: transparent (GNU wc skips invalid bytes without changing state)
267///
268/// Optimized with ASCII run skipping: when a word starts, skips remaining
269/// printable ASCII bytes without per-byte table lookups (~4x fewer state checks
270/// for English text with 5-char average word length).
271fn count_words_utf8(data: &[u8]) -> u64 {
272    let mut words = 0u64;
273    let mut in_word = false;
274    let mut i = 0;
275    let len = data.len();
276
277    while i < len {
278        let b = unsafe { *data.get_unchecked(i) };
279
280        if b >= 0x21 && b <= 0x7E {
281            // Printable ASCII (most common case for text) — word content
282            if !in_word {
283                in_word = true;
284                words += 1;
285            }
286            i += 1;
287            // Skip remaining printable ASCII (they don't change state)
288            while i < len {
289                let b = unsafe { *data.get_unchecked(i) };
290                if b >= 0x21 && b <= 0x7E {
291                    i += 1;
292                } else {
293                    break;
294                }
295            }
296        } else if b < 0x80 {
297            // Non-printable ASCII: space/tab/newline/controls
298            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
299            if class == 1 {
300                in_word = false;
301            }
302            // class == 2: transparent (controls 0x00-0x08, 0x0E-0x1F, 0x7F)
303            i += 1;
304        } else if b < 0xC2 {
305            i += 1;
306        } else if b < 0xE0 {
307            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
308                let cp = ((b as u32 & 0x1F) << 6)
309                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
310                if is_unicode_space(cp) {
311                    in_word = false;
312                } else if is_unicode_printable(cp) {
313                    if !in_word {
314                        in_word = true;
315                        words += 1;
316                    }
317                }
318                i += 2;
319            } else {
320                i += 1;
321            }
322        } else if b < 0xF0 {
323            if i + 2 < len
324                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
325                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
326            {
327                let cp = ((b as u32 & 0x0F) << 12)
328                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
329                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
330                if is_unicode_space(cp) {
331                    in_word = false;
332                } else if is_unicode_printable(cp) {
333                    if !in_word {
334                        in_word = true;
335                        words += 1;
336                    }
337                }
338                i += 3;
339            } else {
340                i += 1;
341            }
342        } else if b < 0xF5 {
343            if i + 3 < len
344                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
345                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
346                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
347            {
348                let cp = ((b as u32 & 0x07) << 18)
349                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
350                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
351                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
352                if is_unicode_space(cp) {
353                    in_word = false;
354                } else if is_unicode_printable(cp) {
355                    if !in_word {
356                        in_word = true;
357                        words += 1;
358                    }
359                }
360                i += 4;
361            } else {
362                i += 1;
363            }
364        } else {
365            i += 1;
366        }
367    }
368
369    words
370}
371
372/// Count lines and words using optimized strategies per locale.
373/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
374/// C locale: single scalar pass with 3-state logic and ASCII run skipping.
375pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
376    if utf8 {
377        count_lines_words_utf8_fused(data)
378    } else {
379        let mut lines = 0u64;
380        let mut words = 0u64;
381        let mut in_word = false;
382        let mut i = 0;
383        let len = data.len();
384
385        while i < len {
386            let b = unsafe { *data.get_unchecked(i) };
387            if b >= 0x21 && b <= 0x7E {
388                // Printable ASCII — word content
389                if !in_word {
390                    in_word = true;
391                    words += 1;
392                }
393                i += 1;
394                while i < len {
395                    let b = unsafe { *data.get_unchecked(i) };
396                    if b >= 0x21 && b <= 0x7E {
397                        i += 1;
398                    } else {
399                        break;
400                    }
401                }
402            } else if b == b'\n' {
403                lines += 1;
404                in_word = false;
405                i += 1;
406            } else {
407                let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
408                if class == 1 {
409                    in_word = false;
410                }
411                i += 1;
412            }
413        }
414        (lines, words)
415    }
416}
417
418/// Fused lines+words counting in UTF-8 mode (single pass).
419/// Avoids separate memchr pass for newlines by counting them inline with words.
420///
421/// Key optimization: ASCII run skipping. Once a word starts (printable ASCII byte),
422/// we skip remaining printable ASCII bytes without any per-byte state checks.
423/// For English text (avg word ~5 chars), this reduces state transitions by ~4x.
424fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
425    let mut lines = 0u64;
426    let mut words = 0u64;
427    let mut in_word = false;
428    let mut i = 0;
429    let len = data.len();
430
431    while i < len {
432        let b = unsafe { *data.get_unchecked(i) };
433
434        if b >= 0x21 && b <= 0x7E {
435            // Printable ASCII (most common) — word content
436            if !in_word {
437                in_word = true;
438                words += 1;
439            }
440            i += 1;
441            // Skip remaining printable ASCII (they don't change state or count lines)
442            while i < len {
443                let b = unsafe { *data.get_unchecked(i) };
444                if b >= 0x21 && b <= 0x7E {
445                    i += 1;
446                } else {
447                    break;
448                }
449            }
450        } else if b == b'\n' {
451            lines += 1;
452            in_word = false;
453            i += 1;
454        } else if b == b' ' {
455            in_word = false;
456            i += 1;
457        } else if b < 0x80 {
458            // Other ASCII: \t, \r, \v, \f, controls
459            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
460            if class == 1 {
461                in_word = false;
462            }
463            // class == 2: transparent
464            i += 1;
465        } else if b < 0xC2 {
466            i += 1;
467        } else if b < 0xE0 {
468            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
469                let cp = ((b as u32 & 0x1F) << 6)
470                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
471                if is_unicode_space(cp) {
472                    in_word = false;
473                } else if is_unicode_printable(cp) {
474                    if !in_word {
475                        in_word = true;
476                        words += 1;
477                    }
478                }
479                i += 2;
480            } else {
481                i += 1;
482            }
483        } else if b < 0xF0 {
484            if i + 2 < len
485                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
486                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
487            {
488                let cp = ((b as u32 & 0x0F) << 12)
489                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
490                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
491                if is_unicode_space(cp) {
492                    in_word = false;
493                } else if is_unicode_printable(cp) {
494                    if !in_word {
495                        in_word = true;
496                        words += 1;
497                    }
498                }
499                i += 3;
500            } else {
501                i += 1;
502            }
503        } else if b < 0xF5 {
504            if i + 3 < len
505                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
506                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
507                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
508            {
509                let cp = ((b as u32 & 0x07) << 18)
510                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
511                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
512                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
513                if is_unicode_space(cp) {
514                    in_word = false;
515                } else if is_unicode_printable(cp) {
516                    if !in_word {
517                        in_word = true;
518                        words += 1;
519                    }
520                }
521                i += 4;
522            } else {
523                i += 1;
524            }
525        } else {
526            i += 1;
527        }
528    }
529
530    (lines, words)
531}
532
533/// Count lines, words, and chars using optimized strategies per locale.
534pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
535    if utf8 {
536        // Fused single-pass for lines+words, then fast char-counting pass
537        let (lines, words) = count_lines_words_utf8_fused(data);
538        let chars = count_chars_utf8(data);
539        (lines, words, chars)
540    } else {
541        // C locale: use optimized fused lines+words, chars = byte count
542        let (lines, words) = count_lines_words(data, false);
543        (lines, words, data.len() as u64)
544    }
545}
546
547/// Count UTF-8 characters by counting non-continuation bytes.
548/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
549/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
550///
551/// Uses AVX2 SIMD on x86_64 for ~32 bytes per cycle throughput.
552/// Falls back to 64-byte block processing with popcount on other architectures.
553pub fn count_chars_utf8(data: &[u8]) -> u64 {
554    #[cfg(target_arch = "x86_64")]
555    {
556        if is_x86_feature_detected!("avx2") {
557            return unsafe { count_chars_utf8_avx2(data) };
558        }
559    }
560    count_chars_utf8_scalar(data)
561}
562
/// AVX2 character counter: counts bytes that are not UTF-8 continuation bytes.
///
/// Each 32-byte vector is masked with 0xC0 and compared against 0x80; lanes
/// that are not continuation bytes add 1 to a per-lane u8 accumulator. The
/// accumulator is flushed with a PSADBW horizontal sum after at most 255
/// vectors so that no u8 lane can overflow. Bytes left after the last full
/// vector are handled one at a time.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    use std::arch::x86_64::*;

    /// Adds up the 32 unsigned byte lanes of `lanes`.
    #[target_feature(enable = "avx2")]
    unsafe fn sum_byte_lanes(lanes: std::arch::x86_64::__m256i) -> u64 {
        use std::arch::x86_64::*;
        unsafe {
            // PSADBW against zero yields four 64-bit partial sums.
            let partial = _mm256_sad_epu8(lanes, _mm256_setzero_si256());
            let upper = _mm256_extracti128_si256(partial, 1);
            let lower = _mm256_castsi256_si128(partial);
            let pair = _mm_add_epi64(lower, upper);
            let folded = _mm_add_epi64(pair, _mm_unpackhi_epi64(pair, pair));
            _mm_cvtsi128_si64(folded) as u64
        }
    }

    // Most vectors that fit in one batch before a u8 lane could overflow.
    const MAX_BATCH: usize = 255;

    unsafe {
        let cont_mask = _mm256_set1_epi8(0xC0u8 as i8);
        let cont_tag = _mm256_set1_epi8(0x80u8 as i8);
        let one_per_lane = _mm256_set1_epi8(1);

        let base = data.as_ptr();
        let mut offset = 0usize;
        let mut total = 0u64;
        let mut vectors_left = data.len() / 32;

        while vectors_left > 0 {
            let batch = vectors_left.min(MAX_BATCH);
            let mut lane_counts = _mm256_setzero_si256();
            for _ in 0..batch {
                // In bounds: `offset + 32 <= (data.len() / 32) * 32 <= data.len()`.
                let chunk = _mm256_loadu_si256(base.add(offset) as *const __m256i);
                let is_cont = _mm256_cmpeq_epi8(_mm256_and_si256(chunk, cont_mask), cont_tag);
                let starts = _mm256_andnot_si256(is_cont, one_per_lane);
                lane_counts = _mm256_add_epi8(lane_counts, starts);
                offset += 32;
            }
            total += sum_byte_lanes(lane_counts);
            vectors_left -= batch;
        }

        // Scalar tail: fewer than 32 bytes remain.
        for &byte in &data[offset..] {
            total += u64::from((byte & 0xC0) != 0x80);
        }

        total
    }
}
626
/// Portable implementation of `count_chars_utf8`: counts every byte that is
/// not a UTF-8 continuation byte (`10xxxxxx`).
///
/// Input is consumed in 64-byte blocks. A block whose bytes are all below
/// 0x80 is pure ASCII and contributes 64 directly; other blocks, and the
/// trailing partial block, are counted byte by byte.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    /// True for any byte that is not a continuation byte.
    fn starts_char(byte: u8) -> bool {
        (byte & 0xC0) != 0x80
    }

    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();
    let mut total = 0u64;

    for block in blocks {
        // OR-ing all bytes leaves the high bit clear only for pure ASCII.
        let combined = block.iter().fold(0u8, |acc, &byte| acc | byte);
        total += if combined < 0x80 {
            64
        } else {
            block.iter().filter(|&&byte| starts_char(byte)).count() as u64
        };
    }

    total + tail.iter().filter(|&&byte| starts_char(byte)).count() as u64
}
678
/// Character count for the C/POSIX locale, where each byte is one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_len = data.len();
    byte_len as u64
}
684
685/// Count characters, choosing behavior based on locale.
686#[inline]
687pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
688    if utf8 {
689        count_chars_utf8(data)
690    } else {
691        count_chars_c(data)
692    }
693}
694
/// Reports whether the process environment selects a UTF-8 locale.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order. The first one
/// that is set to a non-empty, valid-Unicode value decides the answer: true
/// when it contains "utf-8" or "utf8" (case-insensitive). If none qualifies,
/// the result is false.
pub fn is_utf8_locale() -> bool {
    const LOCALE_VARS: [&str; 3] = ["LC_ALL", "LC_CTYPE", "LANG"];

    LOCALE_VARS
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
}
707
/// Decode one UTF-8 character from the start of a byte slice.
///
/// Returns `(codepoint, byte_length)`. For invalid UTF-8 the result is
/// `(first_byte as u32, 1)`, so the caller always advances by at least one
/// byte.
///
/// Invalid input covers: stray continuation bytes, the lead bytes
/// 0xC0/0xC1/0xF5-0xFF, truncated sequences, overlong 3- and 4-byte
/// encodings, UTF-16 surrogates (U+D800-U+DFFF) and values above U+10FFFF.
/// Earlier revisions decoded the last three groups as if they were valid,
/// which contradicted the documented contract.
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    /// Decodes a multi-byte sequence; `None` unless it is well-formed.
    fn well_formed(seq: &[u8]) -> Option<(u32, usize)> {
        // Payload bits of the continuation byte at `idx`, if one is there.
        let cont = |idx: usize| -> Option<u32> {
            match seq.get(idx) {
                Some(&c) if (c & 0xC0) == 0x80 => Some(u32::from(c & 0x3F)),
                _ => None,
            }
        };
        let lead = *seq.first()?;
        match lead {
            // 0xC0/0xC1 would be overlong, so two-byte leads start at 0xC2.
            0xC2..=0xDF => Some(((u32::from(lead & 0x1F) << 6) | cont(1)?, 2)),
            0xE0..=0xEF => {
                let cp = (u32::from(lead & 0x0F) << 12) | (cont(1)? << 6) | cont(2)?;
                // Reject overlong forms and UTF-16 surrogates.
                if cp < 0x800 || (0xD800..=0xDFFF).contains(&cp) {
                    None
                } else {
                    Some((cp, 3))
                }
            }
            0xF0..=0xF4 => {
                let cp = (u32::from(lead & 0x07) << 18)
                    | (cont(1)? << 12)
                    | (cont(2)? << 6)
                    | cont(3)?;
                // Reject overlong forms and anything past U+10FFFF.
                if (0x1_0000..=0x10_FFFF).contains(&cp) {
                    Some((cp, 4))
                } else {
                    None
                }
            }
            // Continuation bytes and 0xC0, 0xC1, 0xF5..=0xFF never start a character.
            _ => None,
        }
    }

    let b0 = bytes[0];
    if b0 < 0x80 {
        return (u32::from(b0), 1);
    }
    well_formed(bytes).unwrap_or((u32::from(b0), 1))
}
751
/// Reports whether `cp` is treated as a zero-width character (combining
/// marks, format controls, variation selectors and similar).
///
/// The original note says GNU wc relies on wcwidth(), which returns 0 for
/// these code points, and that this table must match it.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    // Nothing below U+0300 is in the table, so ASCII and Latin-1 exit early.
    if cp < 0x0300 {
        return false;
    }
    match cp {
        // ── U+0300 – U+05FF ──
        0x0300..=0x036F   // Combining Diacritical Marks
        | 0x0483..=0x0489 // Cyrillic combining marks
        | 0x0591..=0x05BD // Hebrew combining marks
        | 0x05BF
        | 0x05C1..=0x05C2
        | 0x05C4..=0x05C5
        | 0x05C7 => true,

        // ── U+0600 – U+08FF ──
        0x0600..=0x0605   // Arabic number signs
        | 0x0610..=0x061A // Arabic combining marks
        | 0x064B..=0x065F // Arabic combining marks
        | 0x0670
        | 0x06D6..=0x06DD
        | 0x06DF..=0x06E4
        | 0x06E7..=0x06E8
        | 0x06EA..=0x06ED
        | 0x070F
        | 0x0711
        | 0x0730..=0x074A
        | 0x07A6..=0x07B0
        | 0x07EB..=0x07F3
        | 0x07FD
        | 0x0816..=0x0819
        | 0x081B..=0x0823
        | 0x0825..=0x0827
        | 0x0829..=0x082D
        | 0x0859..=0x085B
        | 0x08D3..=0x08E1
        | 0x08E3..=0x0902 => true,

        // ── U+0900 – U+09FF ──
        0x093A
        | 0x093C
        | 0x0941..=0x0948
        | 0x094D
        | 0x0951..=0x0957
        | 0x0962..=0x0963
        | 0x0981
        | 0x09BC
        | 0x09C1..=0x09C4
        | 0x09CD
        | 0x09E2..=0x09E3
        | 0x09FE => true,

        // ── U+0A00 – U+0AFF ──
        0x0A01..=0x0A02
        | 0x0A3C
        | 0x0A41..=0x0A42
        | 0x0A47..=0x0A48
        | 0x0A4B..=0x0A4D
        | 0x0A51
        | 0x0A70..=0x0A71
        | 0x0A75
        | 0x0A81..=0x0A82
        | 0x0ABC
        | 0x0AC1..=0x0AC5
        | 0x0AC7..=0x0AC8
        | 0x0ACD
        | 0x0AE2..=0x0AE3
        | 0x0AFA..=0x0AFF => true,

        // ── U+0B00 – U+0BFF ──
        0x0B01
        | 0x0B3C
        | 0x0B3F
        | 0x0B41..=0x0B44
        | 0x0B4D
        | 0x0B56
        | 0x0B62..=0x0B63
        | 0x0B82
        | 0x0BC0
        | 0x0BCD => true,

        // ── U+0C00 – U+0CFF ──
        0x0C00
        | 0x0C04
        | 0x0C3E..=0x0C40
        | 0x0C46..=0x0C48
        | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56
        | 0x0C62..=0x0C63
        | 0x0C81
        | 0x0CBC
        | 0x0CBF
        | 0x0CC6
        | 0x0CCC..=0x0CCD
        | 0x0CE2..=0x0CE3 => true,

        // ── U+0D00 – U+0DFF ──
        0x0D00..=0x0D01
        | 0x0D3B..=0x0D3C
        | 0x0D41..=0x0D44
        | 0x0D4D
        | 0x0D62..=0x0D63
        | 0x0DCA
        | 0x0DD2..=0x0DD4
        | 0x0DD6 => true,

        // ── U+0E00 – U+0FFF ──
        0x0E31
        | 0x0E34..=0x0E3A
        | 0x0E47..=0x0E4E
        | 0x0EB1
        | 0x0EB4..=0x0EBC
        | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19
        | 0x0F35
        | 0x0F37
        | 0x0F39
        | 0x0F71..=0x0F7E
        | 0x0F80..=0x0F84
        | 0x0F86..=0x0F87
        | 0x0F8D..=0x0F97
        | 0x0F99..=0x0FBC
        | 0x0FC6 => true,

        // ── U+1000 – U+13FF ──
        0x102D..=0x1030
        | 0x1032..=0x1037
        | 0x1039..=0x103A
        | 0x103D..=0x103E
        | 0x1058..=0x1059
        | 0x105E..=0x1060
        | 0x1071..=0x1074
        | 0x1082
        | 0x1085..=0x1086
        | 0x108D
        | 0x109D
        | 0x1160..=0x11FF // Hangul Jamo medial vowels and final consonants
        | 0x135D..=0x135F => true,

        // ── U+1700 – U+19FF ──
        0x1712..=0x1714
        | 0x1732..=0x1734
        | 0x1752..=0x1753
        | 0x1772..=0x1773
        | 0x17B4..=0x17B5
        | 0x17B7..=0x17BD
        | 0x17C6
        | 0x17C9..=0x17D3
        | 0x17DD
        | 0x180B..=0x180D
        | 0x1885..=0x1886
        | 0x18A9
        | 0x1920..=0x1922
        | 0x1927..=0x1928
        | 0x1932
        | 0x1939..=0x193B => true,

        // ── U+1A00 – U+1AFF ──
        0x1A17..=0x1A18
        | 0x1A1B
        | 0x1A56
        | 0x1A58..=0x1A5E
        | 0x1A60
        | 0x1A62
        | 0x1A65..=0x1A6C
        | 0x1A73..=0x1A7C
        | 0x1A7F
        | 0x1AB0..=0x1ABE => true,

        // ── U+1B00 – U+1BFF ──
        0x1B00..=0x1B03
        | 0x1B34
        | 0x1B36..=0x1B3A
        | 0x1B3C
        | 0x1B42
        | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81
        | 0x1BA2..=0x1BA5
        | 0x1BA8..=0x1BA9
        | 0x1BAB..=0x1BAD
        | 0x1BE6
        | 0x1BE8..=0x1BE9
        | 0x1BED
        | 0x1BEF..=0x1BF1 => true,

        // ── U+1C00 – U+1DFF ──
        0x1C2C..=0x1C33
        | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2
        | 0x1CD4..=0x1CE0
        | 0x1CE2..=0x1CE8
        | 0x1CED
        | 0x1CF4
        | 0x1CF8..=0x1CF9
        | 0x1DC0..=0x1DF9
        | 0x1DFB..=0x1DFF => true,

        // ── U+2000 – U+20FF ──
        0x200B..=0x200F   // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        | 0x202A..=0x202E // Bidi control chars
        | 0x2060..=0x2064 // Word joiner, invisible operators
        | 0x2066..=0x206F // Bidi isolates
        | 0x20D0..=0x20F0 // Combining marks for symbols
        => true,

        // ── U+FE00 – U+FFFF ──
        0xFE00..=0xFE0F   // Variation Selectors
        | 0xFE20..=0xFE2F // Combining Half Marks
        | 0xFEFF          // Zero Width No-Break Space (BOM)
        | 0xFFF9..=0xFFFB // Interlinear annotation anchors
        => true,

        // ── Supplementary planes ──
        0x1D167..=0x1D169
        | 0x1D173..=0x1D182
        | 0x1D185..=0x1D18B
        | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001
        | 0xE0020..=0xE007F
        | 0xE0100..=0xE01EF // Variation Selectors Supplement
        => true,

        _ => false,
    }
}
942
/// Report whether codepoint `cp` lies in one of the East Asian Wide/Fullwidth
/// ranges tabulated below, i.e. whether it is counted as two display columns.
///
/// The table is intended to follow glibc `wcwidth()` so that `-L` results
/// line up with GNU wc.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    // Inclusive `(first, last)` pairs. The entries are sorted ascending and do
    // not overlap, which is what makes the binary search below valid.
    const WIDE_RANGES: &[(u32, u32)] = &[
        (0x1100, 0x115F), // Hangul Jamo
        (0x231A, 0x231B), // Watch, Hourglass
        (0x2329, 0x232A), // Angle Brackets
        (0x23E9, 0x23F3), // Various symbols
        (0x23F8, 0x23FA),
        (0x25FD, 0x25FE),
        (0x2614, 0x2615),
        (0x2648, 0x2653),
        (0x267F, 0x267F),
        (0x2693, 0x2693),
        (0x26A1, 0x26A1),
        (0x26AA, 0x26AB),
        (0x26BD, 0x26BE),
        (0x26C4, 0x26C5),
        (0x26CE, 0x26CE),
        (0x26D4, 0x26D4),
        (0x26EA, 0x26EA),
        (0x26F2, 0x26F3),
        (0x26F5, 0x26F5),
        (0x26FA, 0x26FA),
        (0x26FD, 0x26FD),
        (0x2702, 0x2702),
        (0x2705, 0x2705),
        (0x2708, 0x270D),
        (0x270F, 0x270F),
        (0x2712, 0x2712),
        (0x2714, 0x2714),
        (0x2716, 0x2716),
        (0x271D, 0x271D),
        (0x2721, 0x2721),
        (0x2728, 0x2728),
        (0x2733, 0x2734),
        (0x2744, 0x2744),
        (0x2747, 0x2747),
        (0x274C, 0x274C),
        (0x274E, 0x274E),
        (0x2753, 0x2755),
        (0x2757, 0x2757),
        (0x2763, 0x2764),
        (0x2795, 0x2797),
        (0x27A1, 0x27A1),
        (0x27B0, 0x27B0),
        (0x27BF, 0x27BF),
        (0x2934, 0x2935),
        (0x2B05, 0x2B07),
        (0x2B1B, 0x2B1C),
        (0x2B50, 0x2B50),
        (0x2B55, 0x2B55),
        (0x2E80, 0x303E), // CJK Radicals, Kangxi Radicals, Ideographic Description
        (0x3040, 0x33BF), // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        (0x3400, 0x4DBF), // CJK Unified Ideographs Extension A
        (0x4E00, 0xA4CF), // CJK Unified Ideographs, Yi
        (0xA960, 0xA97C), // Hangul Jamo Extended-A
        (0xAC00, 0xD7A3), // Hangul Syllables
        (0xF900, 0xFAFF), // CJK Compatibility Ideographs
        (0xFE10, 0xFE19), // Vertical Forms
        (0xFE30, 0xFE6F), // CJK Compatibility Forms
        (0xFF01, 0xFF60), // Fullwidth forms
        (0xFFE0, 0xFFE6), // Fullwidth Signs
        (0x1F004, 0x1F004),
        (0x1F0CF, 0x1F0CF),
        (0x1F170, 0x1F171),
        (0x1F17E, 0x1F17F),
        (0x1F18E, 0x1F18E),
        (0x1F191, 0x1F19A),
        (0x1F1E0, 0x1F1FF), // Regional Indicators
        (0x1F200, 0x1F202),
        (0x1F210, 0x1F23B),
        (0x1F240, 0x1F248),
        (0x1F250, 0x1F251),
        (0x1F260, 0x1F265),
        (0x1F300, 0x1F64F), // Misc Symbols, Emoticons
        (0x1F680, 0x1F6FF), // Transport Symbols
        (0x1F900, 0x1F9FF), // Supplemental Symbols
        (0x1FA00, 0x1FA6F),
        (0x1FA70, 0x1FAFF),
        (0x20000, 0x2FFFD), // CJK Unified Ideographs Extension B-F
        (0x30000, 0x3FFFD), // CJK Unified Ideographs Extension G
    ];

    // Index of the first range whose upper bound is not below `cp`; that is
    // the only range that could possibly contain it.
    let candidate = WIDE_RANGES.partition_point(|&(_, last)| last < cp);
    WIDE_RANGES
        .get(candidate)
        .map_or(false, |&(first, _)| first <= cp)
}
1029
/// Widest display line in `data`, using C/POSIX-locale byte rules.
///
/// Per-byte handling:
/// - `\n` and `\f` (0x0C): close the current line (its width competes for
///   the maximum) and start a new one at column 0
/// - `\r`: move back to column 0 without closing the line
/// - `\t`: jump to the next multiple of 8
/// - space and printable ASCII (0x20..=0x7E): one column each
/// - every other byte (controls, bytes >= 0x80): zero columns
///
/// Runs of 0x21..=0x7E are measured in one step rather than byte by byte.
/// A final line without a terminator still counts.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut widest: u64 = 0; // widest completed line seen so far
    let mut peak: u64 = 0; // furthest column reached on the current line
    let mut col: u64 = 0; // current column on the current line
    let mut rest = data;

    while let Some((&byte, tail)) = rest.split_first() {
        match byte {
            0x21..=0x7E => {
                // Extend over the whole printable run that starts at `byte`.
                let extra = tail
                    .iter()
                    .take_while(|&&b| matches!(b, 0x21..=0x7E))
                    .count();
                col += 1 + extra as u64;
                peak = peak.max(col);
                rest = &tail[extra..];
                continue;
            }
            b' ' => {
                col += 1;
                peak = peak.max(col);
            }
            b'\t' => {
                // Round up to the next tab stop (always advances at least 1).
                col = (col + 8) & !7;
                peak = peak.max(col);
            }
            b'\r' => col = 0,
            b'\n' | 0x0C => {
                widest = widest.max(peak);
                peak = 0;
                col = 0;
            }
            _ => {} // zero-width byte
        }
        rest = tail;
    }

    // Account for an unterminated last line.
    widest.max(peak)
}
1111
1112/// Compute maximum display width of any line (UTF-8 locale).
1113///
1114/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
1115/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
1116///
1117/// Optimized with printable ASCII run counting for common text.
1118pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1119    let mut max_len: u64 = 0;
1120    let mut line_len: u64 = 0;
1121    let mut linepos: u64 = 0;
1122    let mut i = 0;
1123    let len = data.len();
1124
1125    while i < len {
1126        let b = unsafe { *data.get_unchecked(i) };
1127
1128        if b >= 0x21 && b <= 0x7E {
1129            // Printable non-space ASCII (most common) — count run length
1130            i += 1;
1131            let mut run = 1u64;
1132            while i < len {
1133                let b = unsafe { *data.get_unchecked(i) };
1134                if b >= 0x21 && b <= 0x7E {
1135                    run += 1;
1136                    i += 1;
1137                } else {
1138                    break;
1139                }
1140            }
1141            linepos += run;
1142            if linepos > line_len {
1143                line_len = linepos;
1144            }
1145        } else if b < 0x80 {
1146            // Other ASCII: space, tab, newline, controls
1147            match b {
1148                b' ' => {
1149                    linepos += 1;
1150                    if linepos > line_len {
1151                        line_len = linepos;
1152                    }
1153                }
1154                b'\n' => {
1155                    if line_len > max_len {
1156                        max_len = line_len;
1157                    }
1158                    linepos = 0;
1159                    line_len = 0;
1160                }
1161                b'\t' => {
1162                    linepos = (linepos + 8) & !7;
1163                    if linepos > line_len {
1164                        line_len = linepos;
1165                    }
1166                }
1167                b'\r' => {
1168                    linepos = 0;
1169                }
1170                0x0C => {
1171                    if line_len > max_len {
1172                        max_len = line_len;
1173                    }
1174                    linepos = 0;
1175                    line_len = 0;
1176                }
1177                _ => {} // Non-printable: width 0
1178            }
1179            i += 1;
1180        } else {
1181            // Multibyte UTF-8
1182            let (cp, len) = decode_utf8(&data[i..]);
1183
1184            // C1 control characters (0x80..0x9F): non-printable, width 0
1185            if cp <= 0x9F {
1186                // width 0
1187            } else if is_zero_width(cp) {
1188                // Combining marks, zero-width chars: width 0
1189            } else if is_wide_char(cp) {
1190                linepos += 2;
1191                if linepos > line_len {
1192                    line_len = linepos;
1193                }
1194            } else {
1195                // Regular printable Unicode character: width 1
1196                linepos += 1;
1197                if linepos > line_len {
1198                    line_len = linepos;
1199                }
1200            }
1201            i += len;
1202        }
1203    }
1204
1205    // Handle last line
1206    if line_len > max_len {
1207        max_len = line_len;
1208    }
1209
1210    max_len
1211}
1212
1213/// Compute maximum display width, choosing behavior based on locale.
1214#[inline]
1215pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1216    if utf8 {
1217        max_line_length_utf8(data)
1218    } else {
1219        max_line_length_c(data)
1220    }
1221}
1222
1223/// Count all metrics using optimized individual passes.
1224///
1225/// Each metric uses its own optimized algorithm:
1226/// - Lines: SIMD-accelerated memchr
1227/// - Words: 3-state scalar/state-machine (locale-dependent)
1228/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1229/// - Max line length: locale-aware display width tracking
1230///
1231/// Multi-pass is faster than single-pass because each pass has a tight,
1232/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1233/// making subsequent passes nearly free for memory bandwidth.
1234pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1235    if utf8 {
1236        let (lines, words) = count_lines_words_utf8_fused(data);
1237        WcCounts {
1238            lines,
1239            words,
1240            bytes: data.len() as u64,
1241            chars: count_chars_utf8(data),
1242            max_line_length: max_line_length_utf8(data),
1243        }
1244    } else {
1245        WcCounts {
1246            lines: count_lines(data),
1247            words: count_words_locale(data, false),
1248            bytes: data.len() as u64,
1249            chars: data.len() as u64,
1250            max_line_length: max_line_length_c(data),
1251        }
1252    }
1253}
1254
/// Heuristic ASCII probe: inspects up to three 256-byte windows of `data`
/// (start, middle, end) and returns `false` as soon as one of them contains
/// a byte >= 0x80.
///
/// A `true` result only says the sampled windows are ASCII; bytes outside
/// them are never examined. Empty input yields `true`. The middle window is
/// only inspected when `data` is longer than two windows, the end window
/// only when it is longer than one.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    let total = data.len();
    if total == 0 {
        return true;
    }

    let window = total.min(256);

    // Leading window.
    if !data[..window].is_ascii() {
        return false;
    }

    // Window centred on the midpoint.
    if total > window * 2 {
        let start = (total / 2).saturating_sub(window / 2);
        let end = (start + window).min(total);
        if !data[start..end].is_ascii() {
            return false;
        }
    }

    // Trailing window (already covered by the leading one for short inputs).
    total <= window || data[total - window..].is_ascii()
}
1313
1314// ──────────────────────────────────────────────────
1315// Parallel counting for large files
1316// ──────────────────────────────────────────────────
1317
1318/// Count newlines in parallel using SIMD memchr + rayon.
1319/// Each thread gets at least 1MB (to amortize rayon scheduling overhead).
1320pub fn count_lines_parallel(data: &[u8]) -> u64 {
1321    if data.len() < PARALLEL_THRESHOLD {
1322        return count_lines(data);
1323    }
1324
1325    let num_threads = rayon::current_num_threads().max(1);
1326    // Ensure chunks are large enough to amortize SIMD setup overhead
1327    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1328
1329    data.par_chunks(chunk_size)
1330        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1331        .sum()
1332}
1333
1334/// Count words in parallel with boundary adjustment.
1335pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1336    if utf8 || data.len() < PARALLEL_THRESHOLD {
1337        // UTF-8: state machine can't be trivially parallelized
1338        // (multi-byte sequences may span chunk boundaries).
1339        return count_words_locale(data, utf8);
1340    }
1341
1342    // C locale: parallel 3-state word counting with boundary adjustment
1343    let num_threads = rayon::current_num_threads().max(1);
1344    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1345
1346    let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1347
1348    // Each chunk returns (lines, word_count, first_active_is_printable, ends_in_word)
1349    let results: Vec<(u64, u64, bool, bool)> = chunks
1350        .par_iter()
1351        .map(|chunk| count_lw_c_chunk(chunk))
1352        .collect();
1353
1354    let mut total = 0u64;
1355    for i in 0..results.len() {
1356        total += results[i].1;
1357        // Boundary adjustment: if previous chunk ended in_word AND
1358        // current chunk's first non-transparent byte is printable,
1359        // the word was split across chunks — subtract the overcount.
1360        if i > 0 && results[i - 1].3 && results[i].2 {
1361            total -= 1;
1362        }
1363    }
1364    total
1365}
1366
1367/// Count UTF-8 characters in parallel.
1368pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1369    if !utf8 {
1370        return data.len() as u64;
1371    }
1372    if data.len() < PARALLEL_THRESHOLD {
1373        return count_chars_utf8(data);
1374    }
1375
1376    let num_threads = rayon::current_num_threads().max(1);
1377    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1378
1379    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1380}
1381
1382/// Count lines + words + bytes in a single fused pass (the default wc mode).
1383/// Avoids separate passes entirely — combines newline counting with word detection.
1384pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1385    let (lines, words) = count_lines_words(data, utf8);
1386    (lines, words, data.len() as u64)
1387}
1388
1389/// Parallel counting of lines + words + bytes only (no chars).
1390/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1391/// C locale: single fused pass per chunk counts BOTH lines and words.
1392/// UTF-8 with pure ASCII data: falls back to parallel C locale path.
1393pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1394    if data.len() < PARALLEL_THRESHOLD {
1395        // Small file: use fused single-pass
1396        return count_lwb(data, utf8);
1397    }
1398
1399    // For UTF-8 locale: check if data is pure ASCII first.
1400    // If so, UTF-8 and C locale produce identical word counts,
1401    // and we can use the parallelizable C locale path.
1402    let effective_utf8 = if utf8 {
1403        // Quick ASCII check: sample first, middle, last 256 bytes
1404        let is_ascii = check_ascii_sample(data);
1405        if is_ascii {
1406            false // Use C locale parallel path
1407        } else {
1408            true // Need sequential UTF-8 path
1409        }
1410    } else {
1411        false
1412    };
1413
1414    let (lines, words) = if effective_utf8 {
1415        // Must be sequential for UTF-8 with non-ASCII data
1416        count_lines_words_utf8_fused(data)
1417    } else {
1418        // C locale: FUSED parallel lines+words counting — single pass per chunk
1419        let num_threads = rayon::current_num_threads().max(1);
1420        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1421
1422        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1423        let results: Vec<(u64, u64, bool, bool)> = chunks
1424            .par_iter()
1425            .map(|chunk| count_lw_c_chunk(chunk))
1426            .collect();
1427
1428        let mut line_total = 0u64;
1429        let mut word_total = 0u64;
1430        for i in 0..results.len() {
1431            line_total += results[i].0;
1432            word_total += results[i].1;
1433            if i > 0 && results[i - 1].3 && results[i].2 {
1434                word_total -= 1;
1435            }
1436        }
1437
1438        (line_total, word_total)
1439    };
1440
1441    (lines, words, data.len() as u64)
1442}
1443
1444/// Combined parallel counting of lines + words + chars.
1445pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1446    if data.len() < PARALLEL_THRESHOLD {
1447        let lines = count_lines(data);
1448        let words = count_words_locale(data, utf8);
1449        let chars = count_chars(data, utf8);
1450        return (lines, words, chars);
1451    }
1452
1453    // Word counting: sequential for UTF-8 (state machine), parallel for C locale
1454    let words = count_words_parallel(data, utf8);
1455
1456    // Lines and chars can always be parallelized safely
1457    let num_threads = rayon::current_num_threads().max(1);
1458    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1459
1460    let lines: u64 = data
1461        .par_chunks(chunk_size)
1462        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1463        .sum();
1464
1465    let chars = if utf8 {
1466        data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1467    } else {
1468        data.len() as u64
1469    };
1470
1471    (lines, words, chars)
1472}