Skip to main content

coreutils_rs/wc/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Smallest input size, in bytes, at which parallel processing is used (1 MiB).
///
/// Rayon costs roughly 5-10μs per task, while scanning a 1 MiB chunk with
/// SIMD memchr (~10 GB/s) takes about 100μs, so the scheduling overhead
/// stays below 10%.
const PARALLEL_THRESHOLD: usize = 1 << 20;
8
/// Tallies produced by counting a byte slice.
///
/// Every field starts at zero when built through `Default`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct WcCounts {
    /// Number of lines.
    pub lines: u64,
    /// Number of words.
    pub words: u64,
    /// Number of bytes.
    pub bytes: u64,
    /// Number of characters.
    pub chars: u64,
    /// Length of the longest line.
    pub max_line_length: u64,
}
18
19// ──────────────────────────────────────────────────
20// Byte classification for word counting
21// ──────────────────────────────────────────────────
22//
23// GNU wc 9.4 uses a 3-state model for word counting in UTF-8 locales:
24//   - Space (word-break): whitespace characters
25//   - Printable (word content): printable characters (ASCII 0x21-0x7E, valid Unicode)
26//   - Transparent (no state change): NUL, control chars, DEL, invalid/overlong
27//     UTF-8, and non-printable Unicode characters
28//
29// In the C locale fast path (3-state matching GNU wc 9.4):
30//   - Space {0x09-0x0D, 0x20}: word break (isspace())
31//   - Printable {0x21-0x7E}: word content (isprint(), excluding space)
32//   - Transparent (everything else: 0x00-0x08, 0x0E-0x1F, 0x7F-0xFF): no state change
33//
34// In UTF-8 locale with multibyte path:
35//   - ASCII bytes use the IS_SPACE table (only checked for bytes < 0x80)
36//   - Valid multibyte chars: iswspace() for space, iswprint() for word content
37//   - U+00A0 and other non-breaking spaces: word break (via is_wnbspace)
38//   - Non-printable Unicode: transparent (no state change)
39//   - Encoding errors (EILSEQ): transparent (no state change)
40
/// Builds the byte-indexed whitespace table for the C locale fast path.
///
/// An entry is `true` (word break) exactly for the six bytes that glibc
/// `isspace()` accepts in the C locale: 0x09-0x0D and 0x20. Every other
/// entry is `false`.
const fn make_is_space() -> [bool; 256] {
    let mut table = [false; 256];
    // 0x09..=0x0D: tab, newline, vertical tab, form feed, carriage return.
    let mut byte = 0x09usize;
    while byte <= 0x0D {
        table[byte] = true;
        byte += 1;
    }
    table[0x20] = true; // space
    table
}
/// Whitespace lookup table, evaluated at compile time.
const IS_SPACE: [bool; 256] = make_is_space();
56
/// Builds the byte-indexed printable table for the C locale 3-state model.
///
/// An entry is `true` (word content) for 0x20-0x7E, matching glibc
/// `isprint()` in the C locale. Callers test `IS_SPACE` first, so 0x20 never
/// reaches this table in practice and the effective set is 0x21-0x7E.
const fn make_is_print() -> [bool; 256] {
    let mut table = [false; 256];
    let mut byte = 0x20usize;
    // 0x7F (DEL) is the first byte past the printable range.
    while byte < 0x7F {
        table[byte] = true;
        byte += 1;
    }
    table
}
/// Printable lookup table, evaluated at compile time.
const IS_PRINT: [bool; 256] = make_is_print();
71
72/// For parallel chunk merging: determine if a chunk starts with word content.
73/// In C locale 3-state model, only printable bytes (0x21-0x7E) are word content.
74/// Transparent bytes (controls, high bytes) do not start words.
75#[inline]
76pub(crate) fn first_is_word(data: &[u8]) -> bool {
77    !data.is_empty() && IS_PRINT[data[0] as usize] && !IS_SPACE[data[0] as usize]
78}
79
80// ──────────────────────────────────────────────────
81// Unicode character classification helpers
82// ──────────────────────────────────────────────────
83
/// Returns `true` when `cp` is one of the multi-byte Unicode whitespace
/// code points recognised here (intended to match glibc `iswspace`).
///
/// ASCII whitespace is not covered; the byte table handles it.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    // En Quad (U+2000) through Hair Space (U+200A) form one contiguous run.
    if (0x2000..=0x200A).contains(&cp) {
        return true;
    }
    cp == 0x1680 // Ogham Space Mark
        || cp == 0x2028 // Line Separator
        || cp == 0x2029 // Paragraph Separator
        || cp == 0x205F // Medium Mathematical Space
        || cp == 0x3000 // Ideographic Space
}
99
/// Returns `true` for the "non-breaking space" code points that GNU wc
/// treats as word separators when POSIXLY_CORRECT is not set.
///
/// Mirrors GNU wc 9.7 `iswnbspace()`: U+00A0, U+2007, U+202F, U+2060.
/// This function does not itself consult POSIXLY_CORRECT.
#[inline]
fn is_wnbspace(cp: u32) -> bool {
    const NON_BREAKING: [u32; 4] = [0x00A0, 0x2007, 0x202F, 0x2060];
    NON_BREAKING.contains(&cp)
}
107
108/// Check if a Unicode codepoint is any kind of space (iswspace || iswnbspace).
109#[inline]
110fn is_unicode_word_break(cp: u32) -> bool {
111    is_unicode_space(cp) || is_wnbspace(cp)
112}
113
/// Decides whether a code point counts as "printable" (word content) in the
/// 3-state word counting model.
///
/// This is an approximation of glibc `iswprint()` for common text rather than
/// a full table lookup. It returns `false` for:
///   - everything below U+00A0 (C0 controls, ASCII — which the ASCII path
///     handles separately — DEL, and C1 controls),
///   - surrogates U+D800-U+DFFF and values above U+10FFFF,
///   - the noncharacters U+FDD0-U+FDEF and U+xFFFE / U+xFFFF.
///
/// Every other value is reported as printable.
#[inline]
fn is_printable_unicode(cp: u32) -> bool {
    match cp {
        // C0 controls, ASCII, DEL and C1 controls.
        0x0000..=0x009F => false,
        // Surrogates cannot appear in valid UTF-8; rejected for safety.
        0xD800..=0xDFFF => false,
        // Contiguous noncharacter block.
        0xFDD0..=0xFDEF => false,
        // Beyond the Unicode code space.
        0x11_0000..=u32::MAX => false,
        // The last two code points of every plane are noncharacters.
        _ => (cp & 0xFFFE) != 0xFFFE,
    }
}
141
142// ──────────────────────────────────────────────────
143// Core counting functions
144// ──────────────────────────────────────────────────
145
146/// Count newlines using SIMD-accelerated memchr.
147/// GNU wc counts newline bytes (`\n`), not logical lines.
148#[inline]
149pub fn count_lines(data: &[u8]) -> u64 {
150    memchr_iter(b'\n', data).count() as u64
151}
152
/// Returns the length of `data` in bytes.
///
/// Trivial, but kept so every counter is reachable through the same API shape.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let total = data.len();
    total as u64
}
158
159/// Count words using locale-aware logic (default: UTF-8 3-state).
160pub fn count_words(data: &[u8]) -> u64 {
161    count_words_locale(data, true)
162}
163
164/// Count words with explicit locale control.
165///
166/// In C locale: 3-state (space/printable/transparent) matching GNU wc 9.4.
167/// In UTF-8 locale: 3-state with multibyte Unicode classification.
168pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
169    if utf8 {
170        count_words_utf8(data)
171    } else {
172        count_words_c(data)
173    }
174}
175
176/// Count words in C/POSIX locale using 3-state logic matching GNU wc 9.4.
177/// GNU wc's C locale path uses isspace() and isprint() to classify bytes:
178///   - Space {0x09-0x0D, 0x20}: word break (sets in_word=false)
179///   - Printable {0x21-0x7E}: word content (starts/continues word)
180///   - Transparent (everything else: 0x00-0x08, 0x0E-0x1F, 0x7F-0xFF): no state change
181fn count_words_c(data: &[u8]) -> u64 {
182    let mut words = 0u64;
183    let mut in_word = false;
184    let mut i = 0;
185    let len = data.len();
186
187    while i < len {
188        let b = unsafe { *data.get_unchecked(i) };
189        if IS_SPACE[b as usize] {
190            in_word = false;
191        } else if IS_PRINT[b as usize] {
192            if !in_word {
193                in_word = true;
194                words += 1;
195            }
196        }
197        // else: transparent byte — no state change
198        i += 1;
199    }
200    words
201}
202
203/// Scalar tail for SIMD line+word counters: processes remaining bytes after
204/// the SIMD loop and returns final counts with boundary info.
205/// SAFETY: caller must ensure ptr is valid for [0..len) and i <= len.
206#[cfg(target_arch = "x86_64")]
207#[inline(always)]
208fn count_lw_c_scalar_tail(
209    ptr: *const u8,
210    mut i: usize,
211    len: usize,
212    mut total_lines: u64,
213    mut total_words: u64,
214    mut prev_in_word: bool,
215    data: &[u8],
216) -> (u64, u64, bool, bool) {
217    // C locale 3-state model: space breaks words, printable starts/continues,
218    // transparent (controls, high bytes) has no state change.
219    while i < len {
220        let b = unsafe { *ptr.add(i) };
221        if IS_SPACE[b as usize] {
222            if b == b'\n' {
223                total_lines += 1;
224            }
225            prev_in_word = false;
226        } else if IS_PRINT[b as usize] && !prev_in_word {
227            total_words += 1;
228            prev_in_word = true;
229        }
230        // transparent: no change to prev_in_word
231        i += 1;
232    }
233    let first_word = first_is_word(data);
234    (total_lines, total_words, first_word, prev_in_word)
235}
236
237/// AVX2-accelerated fused line+word counter for C locale chunks.
238/// Processes 32 bytes per iteration using 3-state logic matching GNU wc 9.4:
239///   - Space: {0x09-0x0D, 0x20} — ends word
240///   - Printable: {0x21-0x7E} — starts/continues word
241///   - Transparent: everything else — no state change
242/// When no transparent bytes are present in a chunk (pure ASCII text),
243/// uses fast 2-state bitmask logic. Falls back to scalar for chunks with
244/// transparent bytes (UTF-8 content, control chars).
245#[cfg(target_arch = "x86_64")]
246#[target_feature(enable = "avx2")]
247unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
248    use std::arch::x86_64::*;
249
250    let len = data.len();
251    let ptr = data.as_ptr();
252    let mut i = 0usize;
253    let mut total_lines = 0u64;
254    let mut total_words = 0u64;
255    let mut prev_in_word = false;
256
257    unsafe {
258        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
259        let zero = _mm256_setzero_si256();
260        let ones = _mm256_set1_epi8(1);
261        // Space detection: {0x09-0x0D, 0x20}
262        let const_0x09 = _mm256_set1_epi8(0x09u8 as i8);
263        let const_0x0d = _mm256_set1_epi8(0x0Du8 as i8);
264        let const_0x20 = _mm256_set1_epi8(0x20u8 as i8);
265        // Printable detection: {0x21-0x7E}
266        let const_0x21 = _mm256_set1_epi8(0x21u8 as i8);
267        let const_0x7e = _mm256_set1_epi8(0x7Eu8 as i8);
268
269        let mut line_acc = _mm256_setzero_si256();
270        let mut batch = 0u32;
271
272        while i + 32 <= len {
273            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
274            let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
275            line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));
276
277            // Space check: byte in {0x09-0x0D, 0x20}
278            let ge_09 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x09), v);
279            let le_0d = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x0d), v);
280            let in_tab_range = _mm256_and_si256(ge_09, le_0d);
281            let is_sp = _mm256_cmpeq_epi8(v, const_0x20);
282            let is_space = _mm256_or_si256(in_tab_range, is_sp);
283            let space_mask = _mm256_movemask_epi8(is_space) as u32;
284
285            // Printable check: byte in {0x21-0x7E}
286            let ge_21 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x21), v);
287            let le_7e = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x7e), v);
288            let is_print = _mm256_and_si256(ge_21, le_7e);
289            let print_mask = _mm256_movemask_epi8(is_print) as u32;
290
291            let transparent_mask = !(space_mask | print_mask);
292            if transparent_mask == 0 {
293                // No transparent bytes: 2-state is equivalent to 3-state
294                let prev_space = (space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 };
295                let starts = print_mask & prev_space;
296                total_words += starts.count_ones() as u64;
297                prev_in_word = (print_mask >> 31) & 1 == 1;
298            } else {
299                // Has transparent bytes: scalar 3-state for this 32-byte chunk
300                let end = (i + 32).min(len);
301                for j in i..end {
302                    let b = *ptr.add(j);
303                    if IS_SPACE[b as usize] {
304                        prev_in_word = false;
305                    } else if IS_PRINT[b as usize] && !prev_in_word {
306                        total_words += 1;
307                        prev_in_word = true;
308                    }
309                }
310            }
311
312            batch += 1;
313            if batch >= 255 {
314                let sad = _mm256_sad_epu8(line_acc, zero);
315                let hi = _mm256_extracti128_si256(sad, 1);
316                let lo = _mm256_castsi256_si128(sad);
317                let s = _mm_add_epi64(lo, hi);
318                let h64 = _mm_unpackhi_epi64(s, s);
319                let t = _mm_add_epi64(s, h64);
320                total_lines += _mm_cvtsi128_si64(t) as u64;
321                line_acc = _mm256_setzero_si256();
322                batch = 0;
323            }
324            i += 32;
325        }
326
327        if batch > 0 {
328            let sad = _mm256_sad_epu8(line_acc, zero);
329            let hi = _mm256_extracti128_si256(sad, 1);
330            let lo = _mm256_castsi256_si128(sad);
331            let s = _mm_add_epi64(lo, hi);
332            let h64 = _mm_unpackhi_epi64(s, s);
333            let t = _mm_add_epi64(s, h64);
334            total_lines += _mm_cvtsi128_si64(t) as u64;
335        }
336    }
337
338    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
339}
340
341/// SSE2 variant of count_lw_c_chunk_avx2 — processes 16 bytes per iteration.
342/// See AVX2 function above for algorithm details (3-state with transparent fallback).
343#[cfg(target_arch = "x86_64")]
344#[target_feature(enable = "sse2")]
345unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
346    use std::arch::x86_64::*;
347
348    let len = data.len();
349    let ptr = data.as_ptr();
350    let mut i = 0usize;
351    let mut total_lines = 0u64;
352    let mut total_words = 0u64;
353    let mut prev_in_word = false;
354
355    unsafe {
356        let nl_byte = _mm_set1_epi8(b'\n' as i8);
357        let zero = _mm_setzero_si128();
358        let ones = _mm_set1_epi8(1);
359        let const_0x09 = _mm_set1_epi8(0x09u8 as i8);
360        let const_0x0d = _mm_set1_epi8(0x0Du8 as i8);
361        let const_0x20 = _mm_set1_epi8(0x20u8 as i8);
362        let const_0x21 = _mm_set1_epi8(0x21u8 as i8);
363        let const_0x7e = _mm_set1_epi8(0x7Eu8 as i8);
364
365        let mut line_acc = _mm_setzero_si128();
366        let mut batch = 0u32;
367
368        while i + 16 <= len {
369            let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);
370            let is_nl = _mm_cmpeq_epi8(v, nl_byte);
371            line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));
372
373            // Space: {0x09-0x0D, 0x20}
374            let ge_09 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x09), v);
375            let le_0d = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x0d), v);
376            let in_tab_range = _mm_and_si128(ge_09, le_0d);
377            let is_sp = _mm_cmpeq_epi8(v, const_0x20);
378            let is_space = _mm_or_si128(in_tab_range, is_sp);
379            let space_mask = (_mm_movemask_epi8(is_space) as u32) & 0xFFFF;
380
381            // Printable: {0x21-0x7E}
382            let ge_21 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x21), v);
383            let le_7e = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x7e), v);
384            let is_print = _mm_and_si128(ge_21, le_7e);
385            let print_mask = (_mm_movemask_epi8(is_print) as u32) & 0xFFFF;
386
387            let transparent_mask = !(space_mask | print_mask) & 0xFFFF;
388            if transparent_mask == 0 {
389                let prev_space =
390                    ((space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 }) & 0xFFFF;
391                let starts = print_mask & prev_space;
392                total_words += starts.count_ones() as u64;
393                prev_in_word = (print_mask >> 15) & 1 == 1;
394            } else {
395                let end = (i + 16).min(len);
396                for j in i..end {
397                    let b = *ptr.add(j);
398                    if IS_SPACE[b as usize] {
399                        prev_in_word = false;
400                    } else if IS_PRINT[b as usize] && !prev_in_word {
401                        total_words += 1;
402                        prev_in_word = true;
403                    }
404                }
405            }
406
407            batch += 1;
408            if batch >= 255 {
409                let sad = _mm_sad_epu8(line_acc, zero);
410                let hi = _mm_unpackhi_epi64(sad, sad);
411                let t = _mm_add_epi64(sad, hi);
412                total_lines += _mm_cvtsi128_si64(t) as u64;
413                line_acc = _mm_setzero_si128();
414                batch = 0;
415            }
416            i += 16;
417        }
418
419        if batch > 0 {
420            let sad = _mm_sad_epu8(line_acc, zero);
421            let hi = _mm_unpackhi_epi64(sad, sad);
422            let t = _mm_add_epi64(sad, hi);
423            total_lines += _mm_cvtsi128_si64(t) as u64;
424        }
425    }
426
427    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
428}
429
430/// Dispatch to AVX2, SSE2, or scalar chunk counter.
431#[inline]
432fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
433    #[cfg(target_arch = "x86_64")]
434    {
435        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
436            return unsafe { count_lw_c_chunk_avx2(data) };
437        }
438        if data.len() >= 32 {
439            return unsafe { count_lw_c_chunk_sse2(data) };
440        }
441    }
442    count_lw_c_chunk(data)
443}
444
445/// Count words + lines in a C locale chunk using 3-state logic, returning
446/// counts plus boundary info for parallel chunk merging.
447/// Returns (line_count, word_count, first_is_word_content, ends_in_word).
448fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
449    let mut lines = 0u64;
450    let mut words = 0u64;
451    let mut in_word = false;
452    let mut i = 0;
453    let len = data.len();
454
455    let first_word = first_is_word(data);
456
457    // C locale 3-state model: space breaks words, printable starts/continues,
458    // transparent (controls, high bytes) has no state change.
459    while i < len {
460        let b = unsafe { *data.get_unchecked(i) };
461        if IS_SPACE[b as usize] {
462            if b == b'\n' {
463                lines += 1;
464            }
465            in_word = false;
466        } else if IS_PRINT[b as usize] {
467            if !in_word {
468                in_word = true;
469                words += 1;
470            }
471        }
472        // else: transparent — no state change
473        i += 1;
474    }
475    (lines, words, first_word, in_word)
476}
477
478/// Count words in UTF-8 locale using 3-state logic matching GNU wc 9.4.
479///
480/// Handles:
481/// - ASCII spaces (0x09-0x0D, 0x20): word break
482/// - ASCII printable (0x21-0x7E): word content
483/// - ASCII non-printable, non-space (NUL, controls, DEL): transparent (no state change)
484/// - Valid UTF-8 multi-byte Unicode spaces (iswspace): word break
485/// - Non-breaking spaces (U+00A0, U+2007, U+202F, U+2060): word break (iswnbspace)
486/// - Valid UTF-8 printable non-space chars: word content
487/// - Non-printable Unicode (C1 controls, etc.): transparent
488/// - Invalid UTF-8 encoding errors: transparent (matches GNU wc 9.4 EILSEQ handling)
489fn count_words_utf8(data: &[u8]) -> u64 {
490    let mut words = 0u64;
491    let mut in_word = false;
492    let mut i = 0;
493    let len = data.len();
494
495    while i < len {
496        let b = unsafe { *data.get_unchecked(i) };
497
498        if b < 0x80 {
499            // ASCII byte — 3-state matching GNU wc 9.4:
500            // Space (0x09-0x0D, 0x20): word break
501            // Printable non-space (0x21-0x7E): word content
502            // Non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
503            if IS_SPACE[b as usize] {
504                in_word = false;
505            } else if b >= 0x21 && b <= 0x7E {
506                // Printable ASCII: word content
507                if !in_word {
508                    in_word = true;
509                    words += 1;
510                }
511            }
512            // else: non-printable, non-space → transparent (no state change)
513            i += 1;
514        } else if b < 0xC2 {
515            // Invalid UTF-8: bare continuation byte (0x80-0xBF) or overlong (0xC0-0xC1)
516            // Encoding error → transparent (matches GNU wc 9.4 EILSEQ handling)
517            i += 1;
518        } else if b < 0xE0 {
519            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
520                let cp = ((b as u32 & 0x1F) << 6)
521                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
522                if is_unicode_word_break(cp) {
523                    in_word = false;
524                } else if is_printable_unicode(cp) {
525                    if !in_word {
526                        in_word = true;
527                        words += 1;
528                    }
529                }
530                // else: non-printable, non-space → transparent
531                i += 2;
532            } else {
533                // Incomplete sequence → transparent (encoding error)
534                i += 1;
535            }
536        } else if b < 0xF0 {
537            if i + 2 < len
538                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
539                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
540            {
541                let cp = ((b as u32 & 0x0F) << 12)
542                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
543                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
544                if is_unicode_word_break(cp) {
545                    in_word = false;
546                } else if is_printable_unicode(cp) {
547                    if !in_word {
548                        in_word = true;
549                        words += 1;
550                    }
551                }
552                // else: non-printable, non-space → transparent
553                i += 3;
554            } else {
555                // Incomplete sequence → transparent (encoding error)
556                i += 1;
557            }
558        } else if b < 0xF5 {
559            if i + 3 < len
560                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
561                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
562                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
563            {
564                let cp = ((b as u32 & 0x07) << 18)
565                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
566                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
567                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
568                if is_unicode_word_break(cp) {
569                    in_word = false;
570                } else if is_printable_unicode(cp) {
571                    if !in_word {
572                        in_word = true;
573                        words += 1;
574                    }
575                }
576                // else: non-printable, non-space → transparent
577                i += 4;
578            } else {
579                // Incomplete sequence → transparent (encoding error)
580                i += 1;
581            }
582        } else {
583            // Invalid byte >= 0xF5 → transparent (encoding error)
584            i += 1;
585        }
586    }
587
588    words
589}
590
591/// Count lines and words using optimized strategies per locale.
592/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
593/// C locale: AVX2 SIMD fused counter when available, scalar fallback otherwise.
594pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
595    if utf8 {
596        count_lines_words_utf8_fused(data)
597    } else {
598        let (lines, words, _, _) = count_lw_c_chunk_fast(data);
599        (lines, words)
600    }
601}
602
603/// Fused lines+words counting in UTF-8 mode (single pass).
604/// Avoids separate memchr pass for newlines by counting them inline with words.
605/// Uses 3-state logic matching GNU wc 9.4:
606///   - Encoding errors are transparent (no state change, matching GNU wc EILSEQ)
607///   - ASCII non-printable, non-space bytes (NUL, controls) are transparent
608///   - Printable non-space chars are word content
609///   - Whitespace chars are word breaks
610fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
611    let mut lines = 0u64;
612    let mut words = 0u64;
613    let mut in_word = false;
614    let mut i = 0;
615    let len = data.len();
616
617    while i < len {
618        let b = unsafe { *data.get_unchecked(i) };
619
620        if b == b'\n' {
621            lines += 1;
622            in_word = false;
623            i += 1;
624        } else if b < 0x80 {
625            // ASCII byte — 3-state matching GNU wc 9.4:
626            // Space: word break. Printable (0x21-0x7E): word content.
627            // Non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent.
628            if IS_SPACE[b as usize] {
629                in_word = false;
630            } else if b >= 0x21 && b <= 0x7E {
631                if !in_word {
632                    in_word = true;
633                    words += 1;
634                }
635            }
636            // else: transparent
637            i += 1;
638        } else if b < 0xC2 {
639            // Invalid UTF-8 → transparent (encoding error)
640            i += 1;
641        } else if b < 0xE0 {
642            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
643                let cp = ((b as u32 & 0x1F) << 6)
644                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
645                if is_unicode_word_break(cp) {
646                    in_word = false;
647                } else if is_printable_unicode(cp) {
648                    if !in_word {
649                        in_word = true;
650                        words += 1;
651                    }
652                }
653                i += 2;
654            } else {
655                // Incomplete → transparent
656                i += 1;
657            }
658        } else if b < 0xF0 {
659            if i + 2 < len
660                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
661                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
662            {
663                let cp = ((b as u32 & 0x0F) << 12)
664                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
665                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
666                if is_unicode_word_break(cp) {
667                    in_word = false;
668                } else if is_printable_unicode(cp) {
669                    if !in_word {
670                        in_word = true;
671                        words += 1;
672                    }
673                }
674                i += 3;
675            } else {
676                // Incomplete → transparent
677                i += 1;
678            }
679        } else if b < 0xF5 {
680            if i + 3 < len
681                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
682                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
683                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
684            {
685                let cp = ((b as u32 & 0x07) << 18)
686                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
687                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
688                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
689                if is_unicode_word_break(cp) {
690                    in_word = false;
691                } else if is_printable_unicode(cp) {
692                    if !in_word {
693                        in_word = true;
694                        words += 1;
695                    }
696                }
697                i += 4;
698            } else {
699                // Incomplete → transparent
700                i += 1;
701            }
702        } else {
703            // Invalid byte >= 0xF5 → transparent
704            i += 1;
705        }
706    }
707
708    (lines, words)
709}
710
711/// Count lines, words, and chars using optimized strategies per locale.
712pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
713    if utf8 {
714        // Fused single-pass for lines+words, then fast char-counting pass
715        let (lines, words) = count_lines_words_utf8_fused(data);
716        let chars = count_chars_utf8(data);
717        (lines, words, chars)
718    } else {
719        // C locale: use optimized fused lines+words, chars = byte count
720        let (lines, words) = count_lines_words(data, false);
721        (lines, words, data.len() as u64)
722    }
723}
724
725/// Count UTF-8 characters by counting non-continuation bytes.
726/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
727/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
728///
729/// Uses AVX2 SIMD on x86_64 for ~32 bytes per cycle throughput.
730/// Falls back to 64-byte block processing with popcount on other architectures.
731pub fn count_chars_utf8(data: &[u8]) -> u64 {
732    #[cfg(target_arch = "x86_64")]
733    {
734        if is_x86_feature_detected!("avx2") {
735            return unsafe { count_chars_utf8_avx2(data) };
736        }
737    }
738    count_chars_utf8_scalar(data)
739}
740
/// AVX2 character counter: counts the bytes of `data` that are not UTF-8
/// continuation bytes (`10xxxxxx`).
///
/// Each 32-byte block is masked with 0xC0 and compared against 0x80; the
/// non-matching lanes are accumulated as per-lane u8 counters, which are
/// reduced horizontally with PSADBW every 255 iterations (before a lane could
/// overflow). Bytes after the last full block are counted one at a time.
///
/// # Safety
///
/// The executing CPU must support AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    use std::arch::x86_64::*;

    /// Bytes consumed per SIMD iteration.
    const LANES: usize = 32;
    /// A byte lane grows by at most 1 per iteration, so the u8 accumulator
    /// has to be drained after at most 255 iterations.
    const MAX_BATCH: u32 = 255;

    /// Sums the 32 unsigned byte lanes of `acc` (PSADBW against zero, then
    /// folds the four 64-bit partial sums).
    #[inline]
    #[target_feature(enable = "avx2")]
    unsafe fn drain_lanes(acc: __m256i) -> u64 {
        unsafe {
            let partial = _mm256_sad_epu8(acc, _mm256_setzero_si256());
            let upper = _mm256_extracti128_si256(partial, 1);
            let lower = _mm256_castsi256_si128(partial);
            let pair = _mm_add_epi64(lower, upper);
            let folded = _mm_add_epi64(pair, _mm_unpackhi_epi64(pair, pair));
            _mm_cvtsi128_si64(folded) as u64
        }
    }

    let len = data.len();
    let base = data.as_ptr();
    let mut offset = 0usize;
    let mut chars = 0u64;

    unsafe {
        let top_two_bits = _mm256_set1_epi8(0xC0u8 as i8);
        let continuation_tag = _mm256_set1_epi8(0x80u8 as i8);
        let one = _mm256_set1_epi8(1);

        let mut lanes = _mm256_setzero_si256();
        let mut pending = 0u32;

        while len - offset >= LANES {
            // SAFETY: at least LANES bytes remain at `offset`; the load is unaligned.
            let block = _mm256_loadu_si256(base.add(offset) as *const __m256i);
            let is_continuation =
                _mm256_cmpeq_epi8(_mm256_and_si256(block, top_two_bits), continuation_tag);
            // andnot(mask, 1) leaves 1 in every lane that is NOT a continuation byte.
            lanes = _mm256_add_epi8(lanes, _mm256_andnot_si256(is_continuation, one));

            pending += 1;
            if pending >= MAX_BATCH {
                chars += drain_lanes(lanes);
                lanes = _mm256_setzero_si256();
                pending = 0;
            }
            offset += LANES;
        }

        if pending > 0 {
            chars += drain_lanes(lanes);
        }
    }

    // Scalar pass over the bytes that did not fill a whole block.
    let tail_chars = data[offset..]
        .iter()
        .filter(|&&byte| (byte & 0xC0) != 0x80)
        .count();
    chars + tail_chars as u64
}
804
/// Portable character counter behind `count_chars_utf8`.
///
/// The input is walked in 64-byte blocks. A block whose bytes OR together to
/// a value below 0x80 is pure ASCII and contributes 64 characters outright;
/// otherwise every byte that is not a continuation byte (`10xxxxxx`) is
/// counted. The tail shorter than one block is counted byte by byte.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    const BLOCK: usize = 64;

    /// True for any byte that is not a UTF-8 continuation byte.
    fn starts_char(byte: u8) -> bool {
        (byte & 0xC0) != 0x80
    }

    let blocks = data.chunks_exact(BLOCK);
    let tail = blocks.remainder();

    let from_blocks: u64 = blocks
        .map(|block| {
            let merged = block.iter().fold(0u8, |acc, &byte| acc | byte);
            if merged < 0x80 {
                // No high bit anywhere: all 64 bytes are ASCII characters.
                BLOCK as u64
            } else {
                block.iter().filter(|&&byte| starts_char(byte)).count() as u64
            }
        })
        .sum();

    let from_tail = tail.iter().filter(|&&byte| starts_char(byte)).count() as u64;
    from_blocks + from_tail
}
856
/// Character count under the C/POSIX locale, where one byte is one character,
/// so the result is simply the length of `data`.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_total = data.len();
    byte_total as u64
}
862
863/// Count characters, choosing behavior based on locale.
864#[inline]
865pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
866    if utf8 {
867        count_chars_utf8(data)
868    } else {
869        count_chars_c(data)
870    }
871}
872
/// Reports whether the environment selects a UTF-8 locale.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order. The first one
/// that is set to a non-empty (Unicode) value decides the answer: it must
/// contain `utf-8` or `utf8`, compared case-insensitively. If none of the
/// three is usable the result is `false`.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        // The chain is lazy, so later variables are only read when the
        // earlier ones are unset or empty.
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map(|value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
        .unwrap_or(false)
}
885
/// Decode the UTF-8 character at the start of `bytes`.
///
/// Returns `(codepoint, byte_length)` for a well-formed sequence. For anything
/// ill-formed the result is `(bytes[0] as u32, 1)`, so callers always advance
/// by at least one byte. Ill-formed covers:
/// - a stray continuation byte or an invalid lead byte (0x80..=0xC1, 0xF5..=0xFF),
/// - a truncated sequence or a non-continuation trailing byte,
/// - overlong 3- and 4-byte encodings,
/// - UTF-16 surrogates (U+D800..=U+DFFF),
/// - values above U+10FFFF.
///
/// For a lead byte >= 0x80 a returned length of 1 therefore always means
/// "undecodable byte", never a real character.
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let lead = bytes[0];
    let lead_bits = u32::from(lead);

    // Payload bits of the continuation byte at `index`, or `None` when the
    // byte is missing or does not have the `10xxxxxx` shape.
    let trailing = |index: usize| -> Option<u32> {
        match bytes.get(index) {
            Some(&byte) if byte & 0xC0 == 0x80 => Some(u32::from(byte & 0x3F)),
            _ => None,
        }
    };

    let decoded = match lead {
        0x00..=0x7F => Some((lead_bits, 1)),
        // 0xC0 and 0xC1 could only produce overlong 2-byte forms, so the
        // 2-byte range starts at 0xC2 and needs no further value check.
        0xC2..=0xDF => trailing(1).map(|t1| (((lead_bits & 0x1F) << 6) | t1, 2)),
        0xE0..=0xEF => match (trailing(1), trailing(2)) {
            (Some(t1), Some(t2)) => {
                let cp = ((lead_bits & 0x0F) << 12) | (t1 << 6) | t2;
                // Below U+0800 is overlong; U+D800..=U+DFFF are surrogates.
                if cp >= 0x0800 && !(0xD800..=0xDFFF).contains(&cp) {
                    Some((cp, 3))
                } else {
                    None
                }
            }
            _ => None,
        },
        0xF0..=0xF4 => match (trailing(1), trailing(2), trailing(3)) {
            (Some(t1), Some(t2), Some(t3)) => {
                let cp = ((lead_bits & 0x07) << 18) | (t1 << 12) | (t2 << 6) | t3;
                // Below U+10000 is overlong; above U+10FFFF is not Unicode.
                if (0x1_0000..=0x10_FFFF).contains(&cp) {
                    Some((cp, 4))
                } else {
                    None
                }
            }
            _ => None,
        },
        // 0x80..=0xC1 (continuation / overlong lead) and 0xF5..=0xFF.
        _ => None,
    };

    decoded.unwrap_or((lead_bits, 1))
}
929
/// Reports whether `cp` takes up no display columns (combining marks, format
/// controls, variation selectors and similar).
///
/// The table is meant to mirror the code points for which GNU wc's
/// `wcwidth()` lookup returns 0, so that `wc -L` stays compatible.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    use std::cmp::Ordering;

    // Inclusive `(first, last)` ranges. They must stay sorted in ascending
    // order and must not overlap, because the lookup is a binary search.
    const ZERO_WIDTH: &[(u32, u32)] = &[
        (0x0300, 0x036F), // Combining Diacritical Marks
        (0x0483, 0x0489), // Cyrillic combining marks
        (0x0591, 0x05BD), // Hebrew combining marks
        (0x05BF, 0x05BF),
        (0x05C1, 0x05C2),
        (0x05C4, 0x05C5),
        (0x05C7, 0x05C7),
        (0x0600, 0x0605), // Arabic number signs
        (0x0610, 0x061A), // Arabic combining marks
        (0x064B, 0x065F), // Arabic combining marks
        (0x0670, 0x0670),
        (0x06D6, 0x06DD),
        (0x06DF, 0x06E4),
        (0x06E7, 0x06E8),
        (0x06EA, 0x06ED),
        (0x070F, 0x070F),
        (0x0711, 0x0711),
        (0x0730, 0x074A),
        (0x07A6, 0x07B0),
        (0x07EB, 0x07F3),
        (0x07FD, 0x07FD),
        (0x0816, 0x0819),
        (0x081B, 0x0823),
        (0x0825, 0x0827),
        (0x0829, 0x082D),
        (0x0859, 0x085B),
        (0x08D3, 0x08E1),
        (0x08E3, 0x0902),
        (0x093A, 0x093A),
        (0x093C, 0x093C),
        (0x0941, 0x0948),
        (0x094D, 0x094D),
        (0x0951, 0x0957),
        (0x0962, 0x0963),
        (0x0981, 0x0981),
        (0x09BC, 0x09BC),
        (0x09C1, 0x09C4),
        (0x09CD, 0x09CD),
        (0x09E2, 0x09E3),
        (0x09FE, 0x09FE),
        (0x0A01, 0x0A02),
        (0x0A3C, 0x0A3C),
        (0x0A41, 0x0A42),
        (0x0A47, 0x0A48),
        (0x0A4B, 0x0A4D),
        (0x0A51, 0x0A51),
        (0x0A70, 0x0A71),
        (0x0A75, 0x0A75),
        (0x0A81, 0x0A82),
        (0x0ABC, 0x0ABC),
        (0x0AC1, 0x0AC5),
        (0x0AC7, 0x0AC8),
        (0x0ACD, 0x0ACD),
        (0x0AE2, 0x0AE3),
        (0x0AFA, 0x0AFF),
        (0x0B01, 0x0B01),
        (0x0B3C, 0x0B3C),
        (0x0B3F, 0x0B3F),
        (0x0B41, 0x0B44),
        (0x0B4D, 0x0B4D),
        (0x0B56, 0x0B56),
        (0x0B62, 0x0B63),
        (0x0B82, 0x0B82),
        (0x0BC0, 0x0BC0),
        (0x0BCD, 0x0BCD),
        (0x0C00, 0x0C00),
        (0x0C04, 0x0C04),
        (0x0C3E, 0x0C40),
        (0x0C46, 0x0C48),
        (0x0C4A, 0x0C4D),
        (0x0C55, 0x0C56),
        (0x0C62, 0x0C63),
        (0x0C81, 0x0C81),
        (0x0CBC, 0x0CBC),
        (0x0CBF, 0x0CBF),
        (0x0CC6, 0x0CC6),
        (0x0CCC, 0x0CCD),
        (0x0CE2, 0x0CE3),
        (0x0D00, 0x0D01),
        (0x0D3B, 0x0D3C),
        (0x0D41, 0x0D44),
        (0x0D4D, 0x0D4D),
        (0x0D62, 0x0D63),
        (0x0DCA, 0x0DCA),
        (0x0DD2, 0x0DD4),
        (0x0DD6, 0x0DD6),
        (0x0E31, 0x0E31),
        (0x0E34, 0x0E3A),
        (0x0E47, 0x0E4E),
        (0x0EB1, 0x0EB1),
        (0x0EB4, 0x0EBC),
        (0x0EC8, 0x0ECD),
        (0x0F18, 0x0F19),
        (0x0F35, 0x0F35),
        (0x0F37, 0x0F37),
        (0x0F39, 0x0F39),
        (0x0F71, 0x0F7E),
        (0x0F80, 0x0F84),
        (0x0F86, 0x0F87),
        (0x0F8D, 0x0F97),
        (0x0F99, 0x0FBC),
        (0x0FC6, 0x0FC6),
        (0x102D, 0x1030),
        (0x1032, 0x1037),
        (0x1039, 0x103A),
        (0x103D, 0x103E),
        (0x1058, 0x1059),
        (0x105E, 0x1060),
        (0x1071, 0x1074),
        (0x1082, 0x1082),
        (0x1085, 0x1086),
        (0x108D, 0x108D),
        (0x109D, 0x109D),
        (0x1160, 0x11FF), // Hangul Jamo medial vowels and final consonants
        (0x135D, 0x135F),
        (0x1712, 0x1714),
        (0x1732, 0x1734),
        (0x1752, 0x1753),
        (0x1772, 0x1773),
        (0x17B4, 0x17B5),
        (0x17B7, 0x17BD),
        (0x17C6, 0x17C6),
        (0x17C9, 0x17D3),
        (0x17DD, 0x17DD),
        (0x180B, 0x180D),
        (0x1885, 0x1886),
        (0x18A9, 0x18A9),
        (0x1920, 0x1922),
        (0x1927, 0x1928),
        (0x1932, 0x1932),
        (0x1939, 0x193B),
        (0x1A17, 0x1A18),
        (0x1A1B, 0x1A1B),
        (0x1A56, 0x1A56),
        (0x1A58, 0x1A5E),
        (0x1A60, 0x1A60),
        (0x1A62, 0x1A62),
        (0x1A65, 0x1A6C),
        (0x1A73, 0x1A7C),
        (0x1A7F, 0x1A7F),
        (0x1AB0, 0x1ABE),
        (0x1B00, 0x1B03),
        (0x1B34, 0x1B34),
        (0x1B36, 0x1B3A),
        (0x1B3C, 0x1B3C),
        (0x1B42, 0x1B42),
        (0x1B6B, 0x1B73),
        (0x1B80, 0x1B81),
        (0x1BA2, 0x1BA5),
        (0x1BA8, 0x1BA9),
        (0x1BAB, 0x1BAD),
        (0x1BE6, 0x1BE6),
        (0x1BE8, 0x1BE9),
        (0x1BED, 0x1BED),
        (0x1BEF, 0x1BF1),
        (0x1C2C, 0x1C33),
        (0x1C36, 0x1C37),
        (0x1CD0, 0x1CD2),
        (0x1CD4, 0x1CE0),
        (0x1CE2, 0x1CE8),
        (0x1CED, 0x1CED),
        (0x1CF4, 0x1CF4),
        (0x1CF8, 0x1CF9),
        (0x1DC0, 0x1DF9),
        (0x1DFB, 0x1DFF),
        (0x200B, 0x200F), // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        (0x202A, 0x202E), // Bidi control chars
        (0x2060, 0x2064), // Word joiner, invisible operators
        (0x2066, 0x206F), // Bidi isolates
        (0x20D0, 0x20F0), // Combining marks for symbols
        (0xFE00, 0xFE0F), // Variation Selectors
        (0xFE20, 0xFE2F), // Combining Half Marks
        (0xFEFF, 0xFEFF), // Zero Width No-Break Space (BOM)
        (0xFFF9, 0xFFFB), // Interlinear annotation anchors
        (0x1D167, 0x1D169),
        (0x1D173, 0x1D182),
        (0x1D185, 0x1D18B),
        (0x1D1AA, 0x1D1AD),
        (0x1D242, 0x1D244),
        (0xE0001, 0xE0001),
        (0xE0020, 0xE007F),
        (0xE0100, 0xE01EF), // Variation Selectors Supplement
    ];

    ZERO_WIDTH
        .binary_search_by(|&(first, last)| {
            if last < cp {
                Ordering::Less
            } else if first > cp {
                Ordering::Greater
            } else {
                Ordering::Equal
            }
        })
        .is_ok()
}
1120
/// Reports whether `cp` is rendered two columns wide (East Asian
/// Wide/Fullwidth characters and the emoji ranges listed below).
///
/// The table is intended to follow glibc `wcwidth()` for GNU compatibility.
// NOTE(review): a few single-symbol entries (e.g. U+2702, U+2764) may be
// narrow in glibc's tables; confirm against the targeted glibc version.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    use std::cmp::Ordering;

    // Inclusive `(first, last)` ranges. They must stay sorted in ascending
    // order and must not overlap, because the lookup is a binary search.
    const WIDE: &[(u32, u32)] = &[
        (0x1100, 0x115F), // Hangul Jamo
        (0x231A, 0x231B), // Watch, Hourglass
        (0x2329, 0x232A), // Angle Brackets
        (0x23E9, 0x23F3), // Various symbols
        (0x23F8, 0x23FA),
        (0x25FD, 0x25FE),
        (0x2614, 0x2615),
        (0x2648, 0x2653),
        (0x267F, 0x267F),
        (0x2693, 0x2693),
        (0x26A1, 0x26A1),
        (0x26AA, 0x26AB),
        (0x26BD, 0x26BE),
        (0x26C4, 0x26C5),
        (0x26CE, 0x26CE),
        (0x26D4, 0x26D4),
        (0x26EA, 0x26EA),
        (0x26F2, 0x26F3),
        (0x26F5, 0x26F5),
        (0x26FA, 0x26FA),
        (0x26FD, 0x26FD),
        (0x2702, 0x2702),
        (0x2705, 0x2705),
        (0x2708, 0x270D),
        (0x270F, 0x270F),
        (0x2712, 0x2712),
        (0x2714, 0x2714),
        (0x2716, 0x2716),
        (0x271D, 0x271D),
        (0x2721, 0x2721),
        (0x2728, 0x2728),
        (0x2733, 0x2734),
        (0x2744, 0x2744),
        (0x2747, 0x2747),
        (0x274C, 0x274C),
        (0x274E, 0x274E),
        (0x2753, 0x2755),
        (0x2757, 0x2757),
        (0x2763, 0x2764),
        (0x2795, 0x2797),
        (0x27A1, 0x27A1),
        (0x27B0, 0x27B0),
        (0x27BF, 0x27BF),
        (0x2934, 0x2935),
        (0x2B05, 0x2B07),
        (0x2B1B, 0x2B1C),
        (0x2B50, 0x2B50),
        (0x2B55, 0x2B55),
        (0x2E80, 0x303E), // CJK Radicals, Kangxi Radicals, Ideographic Description
        (0x3040, 0x33BF), // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        (0x3400, 0x4DBF), // CJK Unified Ideographs Extension A
        (0x4E00, 0xA4CF), // CJK Unified Ideographs, Yi
        (0xA960, 0xA97C), // Hangul Jamo Extended-A
        (0xAC00, 0xD7A3), // Hangul Syllables
        (0xF900, 0xFAFF), // CJK Compatibility Ideographs
        (0xFE10, 0xFE19), // Vertical Forms
        (0xFE30, 0xFE6F), // CJK Compatibility Forms
        (0xFF01, 0xFF60), // Fullwidth forms
        (0xFFE0, 0xFFE6), // Fullwidth Signs
        (0x1F004, 0x1F004),
        (0x1F0CF, 0x1F0CF),
        (0x1F170, 0x1F171),
        (0x1F17E, 0x1F17F),
        (0x1F18E, 0x1F18E),
        (0x1F191, 0x1F19A),
        (0x1F1E0, 0x1F1FF), // Regional Indicators
        (0x1F200, 0x1F202),
        (0x1F210, 0x1F23B),
        (0x1F240, 0x1F248),
        (0x1F250, 0x1F251),
        (0x1F260, 0x1F265),
        (0x1F300, 0x1F64F), // Misc Symbols, Emoticons
        (0x1F680, 0x1F6FF), // Transport Symbols
        (0x1F900, 0x1F9FF), // Supplemental Symbols
        (0x1FA00, 0x1FA6F),
        (0x1FA70, 0x1FAFF),
        (0x20000, 0x2FFFD), // CJK Unified Ideographs Extension B-F
        (0x30000, 0x3FFFD), // CJK Unified Ideographs Extension G
    ];

    WIDE.binary_search_by(|&(first, last)| {
        if last < cp {
            Ordering::Less
        } else if first > cp {
            Ordering::Greater
        } else {
            Ordering::Equal
        }
    })
    .is_ok()
}
1207
/// Widest display column reached by any line of `data` (C/POSIX locale).
///
/// Column rules, following GNU `wc -L` in the C locale:
/// - printable ASCII including space (0x20..=0x7E) advances one column;
/// - `\t` jumps to the next multiple of 8;
/// - `\n` and `\f` end the line, `\r` returns to column 0 on the same line;
/// - every other byte (controls, DEL, bytes >= 0x80) has width 0.
///
/// Because the column only returns to 0 on `\n`, `\f` and `\r`, the answer is
/// simply the largest column value ever reached. Runs of non-space printable
/// bytes are consumed in one step.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut widest: u64 = 0;
    let mut column: u64 = 0;
    let mut rest = data;

    while let Some(&byte) = rest.first() {
        let consumed = match byte {
            0x21..=0x7E => {
                // Measure the whole printable run at once (at least one byte).
                let run = rest
                    .iter()
                    .take_while(|&&b| matches!(b, 0x21..=0x7E))
                    .count();
                column += run as u64;
                run
            }
            b' ' => {
                column += 1;
                1
            }
            b'\t' => {
                column = (column + 8) & !7;
                1
            }
            b'\n' | b'\r' | 0x0C => {
                column = 0;
                1
            }
            // Non-printable: width 0.
            _ => 1,
        };
        widest = widest.max(column);
        rest = &rest[consumed..];
    }

    widest
}
1289
1290/// Compute maximum display width of any line (UTF-8 locale).
1291///
1292/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
1293/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
1294///
1295/// Optimized with printable ASCII run counting for common text.
1296pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1297    let mut max_len: u64 = 0;
1298    let mut line_len: u64 = 0;
1299    let mut linepos: u64 = 0;
1300    let mut i = 0;
1301    let len = data.len();
1302
1303    while i < len {
1304        let b = unsafe { *data.get_unchecked(i) };
1305
1306        if b >= 0x21 && b <= 0x7E {
1307            // Printable non-space ASCII (most common) — count run length
1308            i += 1;
1309            let mut run = 1u64;
1310            while i < len {
1311                let b = unsafe { *data.get_unchecked(i) };
1312                if b >= 0x21 && b <= 0x7E {
1313                    run += 1;
1314                    i += 1;
1315                } else {
1316                    break;
1317                }
1318            }
1319            linepos += run;
1320            if linepos > line_len {
1321                line_len = linepos;
1322            }
1323        } else if b < 0x80 {
1324            // Other ASCII: space, tab, newline, controls
1325            match b {
1326                b' ' => {
1327                    linepos += 1;
1328                    if linepos > line_len {
1329                        line_len = linepos;
1330                    }
1331                }
1332                b'\n' => {
1333                    if line_len > max_len {
1334                        max_len = line_len;
1335                    }
1336                    linepos = 0;
1337                    line_len = 0;
1338                }
1339                b'\t' => {
1340                    linepos = (linepos + 8) & !7;
1341                    if linepos > line_len {
1342                        line_len = linepos;
1343                    }
1344                }
1345                b'\r' => {
1346                    linepos = 0;
1347                }
1348                0x0C => {
1349                    if line_len > max_len {
1350                        max_len = line_len;
1351                    }
1352                    linepos = 0;
1353                    line_len = 0;
1354                }
1355                _ => {} // Non-printable: width 0
1356            }
1357            i += 1;
1358        } else {
1359            // Multibyte UTF-8
1360            let (cp, blen) = decode_utf8(&data[i..]);
1361
1362            // C1 control characters (0x80..0x9F): non-printable, width 0
1363            if cp <= 0x9F {
1364                // width 0
1365            } else if is_zero_width(cp) {
1366                // Combining marks, zero-width chars: width 0
1367            } else if is_wide_char(cp) {
1368                linepos += 2;
1369                if linepos > line_len {
1370                    line_len = linepos;
1371                }
1372            } else {
1373                // Regular printable Unicode character: width 1
1374                linepos += 1;
1375                if linepos > line_len {
1376                    line_len = linepos;
1377                }
1378            }
1379            i += blen;
1380        }
1381    }
1382
1383    // Handle last line
1384    if line_len > max_len {
1385        max_len = line_len;
1386    }
1387
1388    max_len
1389}
1390
1391/// Compute maximum display width, choosing behavior based on locale.
1392#[inline]
1393pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1394    if utf8 {
1395        max_line_length_utf8(data)
1396    } else {
1397        max_line_length_c(data)
1398    }
1399}
1400
1401/// Count all metrics using optimized individual passes.
1402///
1403/// Each metric uses its own optimized algorithm:
1404/// - Lines: SIMD-accelerated memchr
1405/// - Words: 2-state scalar/state-machine (locale-dependent)
1406/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1407/// - Max line length: locale-aware display width tracking
1408///
1409/// Multi-pass is faster than single-pass because each pass has a tight,
1410/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1411/// making subsequent passes nearly free for memory bandwidth.
1412pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1413    if utf8 {
1414        let (lines, words) = count_lines_words_utf8_fused(data);
1415        WcCounts {
1416            lines,
1417            words,
1418            bytes: data.len() as u64,
1419            chars: count_chars_utf8(data),
1420            max_line_length: max_line_length_utf8(data),
1421        }
1422    } else {
1423        WcCounts {
1424            lines: count_lines(data),
1425            words: count_words_locale(data, false),
1426            bytes: data.len() as u64,
1427            chars: data.len() as u64,
1428            max_line_length: max_line_length_c(data),
1429        }
1430    }
1431}
1432
/// Cheap heuristic: does `data` look like pure ASCII?
///
/// Only up to three windows of at most 256 bytes are inspected: the start,
/// the middle (when the input is longer than two windows) and the end (when
/// it is longer than one window). Returns `false` as soon as a byte >= 0x80
/// shows up in a window. A `true` result therefore says nothing about bytes
/// outside the sampled windows.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    const WINDOW: usize = 256;

    let total = data.len();
    let window = WINDOW.min(total);

    // Leading window (empty input yields an empty, trivially-ASCII slice).
    if !data[..window].is_ascii() {
        return false;
    }

    // Window centred on the midpoint.
    if total > window * 2 {
        let from = (total / 2).saturating_sub(window / 2);
        let to = (from + window).min(total);
        if !data[from..to].is_ascii() {
            return false;
        }
    }

    // Trailing window.
    if total > window && !data[total - window..].is_ascii() {
        return false;
    }

    true
}
1491
1492// ──────────────────────────────────────────────────
1493// Parallel counting for large files
1494// ──────────────────────────────────────────────────
1495
1496/// Split data into chunks at newline boundaries for parallel processing.
1497/// Returns slices where each slice (except possibly the last) ends with `\n`.
1498/// Splitting at newlines guarantees word boundaries in any locale,
1499/// enabling safe parallel word counting without boundary adjustment.
1500fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1501    if data.is_empty() || num_chunks <= 1 {
1502        return vec![data];
1503    }
1504    let chunk_size = data.len() / num_chunks;
1505    let mut chunks = Vec::with_capacity(num_chunks);
1506    let mut pos = 0;
1507
1508    for _ in 0..num_chunks - 1 {
1509        let target = pos + chunk_size;
1510        if target >= data.len() {
1511            break;
1512        }
1513        let boundary = memchr::memchr(b'\n', &data[target..])
1514            .map(|p| target + p + 1)
1515            .unwrap_or(data.len());
1516        if boundary > pos {
1517            chunks.push(&data[pos..boundary]);
1518        }
1519        pos = boundary;
1520    }
1521    if pos < data.len() {
1522        chunks.push(&data[pos..]);
1523    }
1524    chunks
1525}
1526
1527/// Count newlines in parallel using SIMD memchr + rayon.
1528/// Each thread gets at least 1MB (to amortize rayon scheduling overhead).
1529pub fn count_lines_parallel(data: &[u8]) -> u64 {
1530    if data.len() < PARALLEL_THRESHOLD {
1531        return count_lines(data);
1532    }
1533
1534    let num_threads = rayon::current_num_threads().max(1);
1535    // Ensure chunks are large enough to amortize SIMD setup overhead
1536    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1537
1538    data.par_chunks(chunk_size)
1539        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1540        .sum()
1541}
1542
1543/// Count words in parallel with boundary adjustment.
1544pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1545    if data.len() < PARALLEL_THRESHOLD {
1546        return count_words_locale(data, utf8);
1547    }
1548
1549    let num_threads = rayon::current_num_threads().max(1);
1550
1551    if utf8 {
1552        // UTF-8: split at newline boundaries for safe parallel word counting.
1553        // Newlines are always word boundaries, so no boundary adjustment needed.
1554        let chunks = split_at_newlines(data, num_threads);
1555        chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1556    } else {
1557        // C locale: parallel 2-state word counting with boundary adjustment
1558        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1559
1560        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1561
1562        // Each chunk returns (lines, word_count, first_is_word, ends_in_word)
1563        let results: Vec<(u64, u64, bool, bool)> = chunks
1564            .par_iter()
1565            .map(|chunk| count_lw_c_chunk(chunk))
1566            .collect();
1567
1568        let mut total = 0u64;
1569        for i in 0..results.len() {
1570            total += results[i].1;
1571            // Boundary adjustment: if previous chunk ended in_word AND
1572            // current chunk's first byte is non-space (word content),
1573            // the word was split across chunks — subtract the overcount.
1574            if i > 0 && results[i - 1].3 && results[i].2 {
1575                total -= 1;
1576            }
1577        }
1578        total
1579    }
1580}
1581
1582/// Count UTF-8 characters in parallel.
1583pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1584    if !utf8 {
1585        return data.len() as u64;
1586    }
1587    if data.len() < PARALLEL_THRESHOLD {
1588        return count_chars_utf8(data);
1589    }
1590
1591    let num_threads = rayon::current_num_threads().max(1);
1592    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1593
1594    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1595}
1596
1597/// Count lines + words + bytes in a single fused pass (the default wc mode).
1598/// Avoids separate passes entirely — combines newline counting with word detection.
1599pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1600    let (lines, words) = count_lines_words(data, utf8);
1601    (lines, words, data.len() as u64)
1602}
1603
1604/// Parallel counting of lines + words + bytes only (no chars).
1605/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1606/// C locale: single fused pass per chunk counts BOTH lines and words.
1607/// UTF-8: checks ASCII first for C locale fast path, else splits at newlines
1608/// for safe parallel UTF-8 word counting.
1609pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1610    if data.len() < PARALLEL_THRESHOLD {
1611        // Small file: use fused single-pass
1612        return count_lwb(data, utf8);
1613    }
1614
1615    let num_threads = rayon::current_num_threads().max(1);
1616
1617    let (lines, words) = if !utf8 {
1618        // C locale: FUSED parallel lines+words counting — single pass per chunk
1619        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1620
1621        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1622        let results: Vec<(u64, u64, bool, bool)> = chunks
1623            .par_iter()
1624            .map(|chunk| count_lw_c_chunk_fast(chunk))
1625            .collect();
1626
1627        let mut line_total = 0u64;
1628        let mut word_total = 0u64;
1629        for i in 0..results.len() {
1630            line_total += results[i].0;
1631            word_total += results[i].1;
1632            if i > 0 && results[i - 1].3 && results[i].2 {
1633                word_total -= 1;
1634            }
1635        }
1636
1637        (line_total, word_total)
1638    } else {
1639        // UTF-8 locale: check if ASCII for faster C locale path
1640        let is_ascii = check_ascii_sample(data);
1641        if is_ascii {
1642            // Pure ASCII: use C locale parallel path (arbitrary chunks OK)
1643            let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1644            let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1645            let results: Vec<(u64, u64, bool, bool)> = chunks
1646                .par_iter()
1647                .map(|chunk| count_lw_c_chunk_fast(chunk))
1648                .collect();
1649
1650            let mut line_total = 0u64;
1651            let mut word_total = 0u64;
1652            for i in 0..results.len() {
1653                line_total += results[i].0;
1654                word_total += results[i].1;
1655                if i > 0 && results[i - 1].3 && results[i].2 {
1656                    word_total -= 1;
1657                }
1658            }
1659            (line_total, word_total)
1660        } else {
1661            // Non-ASCII UTF-8: split at newline boundaries for safe parallel
1662            // word counting. Newlines always break words, so no adjustment needed.
1663            let chunks = split_at_newlines(data, num_threads);
1664            let results: Vec<(u64, u64)> = chunks
1665                .par_iter()
1666                .map(|chunk| count_lines_words_utf8_fused(chunk))
1667                .collect();
1668            let mut line_total = 0u64;
1669            let mut word_total = 0u64;
1670            for (l, w) in results {
1671                line_total += l;
1672                word_total += w;
1673            }
1674            (line_total, word_total)
1675        }
1676    };
1677
1678    (lines, words, data.len() as u64)
1679}
1680
1681/// Combined parallel counting of lines + words + chars.
1682/// UTF-8: splits at newline boundaries for fused lines+words+chars per chunk.
1683/// C locale: fused parallel lines+words with boundary adjustment + parallel chars.
1684pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1685    if data.len() < PARALLEL_THRESHOLD {
1686        let lines = count_lines(data);
1687        let words = count_words_locale(data, utf8);
1688        let chars = count_chars(data, utf8);
1689        return (lines, words, chars);
1690    }
1691
1692    let num_threads = rayon::current_num_threads().max(1);
1693
1694    if utf8 {
1695        // UTF-8: fused parallel lines+words+chars per chunk (split at newlines)
1696        let chunks = split_at_newlines(data, num_threads);
1697        let results: Vec<(u64, u64, u64)> = chunks
1698            .par_iter()
1699            .map(|chunk| {
1700                let (lines, words) = count_lines_words_utf8_fused(chunk);
1701                let chars = count_chars_utf8(chunk);
1702                (lines, words, chars)
1703            })
1704            .collect();
1705        let mut lines = 0u64;
1706        let mut words = 0u64;
1707        let mut chars = 0u64;
1708        for (l, w, c) in results {
1709            lines += l;
1710            words += w;
1711            chars += c;
1712        }
1713        (lines, words, chars)
1714    } else {
1715        // C locale: fused parallel lines+words + parallel chars (= byte count)
1716        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1717        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1718        let results: Vec<(u64, u64, bool, bool)> = chunks
1719            .par_iter()
1720            .map(|chunk| count_lw_c_chunk_fast(chunk))
1721            .collect();
1722        let mut lines = 0u64;
1723        let mut words = 0u64;
1724        for i in 0..results.len() {
1725            lines += results[i].0;
1726            words += results[i].1;
1727            if i > 0 && results[i - 1].3 && results[i].2 {
1728                words -= 1;
1729            }
1730        }
1731        (lines, words, data.len() as u64)
1732    }
1733}
1734
1735/// Parallel max line length computation.
1736/// Splits at newline boundaries so each chunk independently computes correct
1737/// max line width (since newlines reset position tracking).
1738pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1739    if data.len() < PARALLEL_THRESHOLD {
1740        return max_line_length(data, utf8);
1741    }
1742    let num_threads = rayon::current_num_threads().max(1);
1743    let chunks = split_at_newlines(data, num_threads);
1744    chunks
1745        .par_iter()
1746        .map(|chunk| {
1747            if utf8 {
1748                max_line_length_utf8(chunk)
1749            } else {
1750                max_line_length_c(chunk)
1751            }
1752        })
1753        .max()
1754        .unwrap_or(0)
1755}
1756
1757/// Parallel counting of all metrics at once.
1758/// Splits at newline boundaries for safe parallel word + max_line_length counting.
1759/// Each chunk computes all metrics in a single traversal group, maximizing cache reuse.
1760pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1761    if data.len() < PARALLEL_THRESHOLD {
1762        return count_all(data, utf8);
1763    }
1764
1765    let num_threads = rayon::current_num_threads().max(1);
1766    let chunks = split_at_newlines(data, num_threads);
1767
1768    if utf8 {
1769        let results: Vec<(u64, u64, u64, u64)> = chunks
1770            .par_iter()
1771            .map(|chunk| {
1772                let (lines, words) = count_lines_words_utf8_fused(chunk);
1773                let chars = count_chars_utf8(chunk);
1774                let max_ll = max_line_length_utf8(chunk);
1775                (lines, words, chars, max_ll)
1776            })
1777            .collect();
1778
1779        let mut counts = WcCounts {
1780            bytes: data.len() as u64,
1781            ..Default::default()
1782        };
1783        for (l, w, c, m) in results {
1784            counts.lines += l;
1785            counts.words += w;
1786            counts.chars += c;
1787            if m > counts.max_line_length {
1788                counts.max_line_length = m;
1789            }
1790        }
1791        counts
1792    } else {
1793        // C locale: fused lines+words per chunk + max_line_length per chunk
1794        let results: Vec<(u64, u64, u64)> = chunks
1795            .par_iter()
1796            .map(|chunk| {
1797                let (lines, words) = count_lines_words(chunk, false);
1798                let max_ll = max_line_length_c(chunk);
1799                (lines, words, max_ll)
1800            })
1801            .collect();
1802
1803        let mut counts = WcCounts {
1804            bytes: data.len() as u64,
1805            chars: data.len() as u64,
1806            ..Default::default()
1807        };
1808        for (l, w, m) in &results {
1809            counts.lines += l;
1810            counts.words += w;
1811            if *m > counts.max_line_length {
1812                counts.max_line_length = *m;
1813            }
1814        }
1815        counts
1816    }
1817}