// coreutils_rs/wc/core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Smallest input size, in bytes, that is worth processing in parallel (1 MiB).
///
/// Rationale: scheduling a Rayon task costs roughly 5-10μs, while scanning a
/// 1 MiB chunk with SIMD memchr (~10 GB/s) takes about 100μs, so the
/// scheduling overhead stays at or below roughly 10%.
const PARALLEL_THRESHOLD: usize = 1 << 20;
8
/// Results from counting a byte slice.
///
/// Every field is a plain `u64`, so the struct is `Copy`: totals can be passed
/// and accumulated by value without explicit `.clone()` calls. `Default`
/// yields all-zero counts.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Line count.
    pub lines: u64,
    /// Word count.
    pub words: u64,
    /// Byte count.
    pub bytes: u64,
    /// Character count.
    pub chars: u64,
    /// Length of the longest line; the unit is decided by the code that fills
    /// this struct, which is not part of this definition.
    pub max_line_length: u64,
}
18
19// ──────────────────────────────────────────────────
20// 3-state byte classification for word counting
21// ──────────────────────────────────────────────────
22//
23// GNU wc uses mbrtowc() + iswspace() + iswprint() with 3-state logic:
24//   0 = printable (word content): starts or continues a word
25//   1 = space (word break): ends any current word
26//   2 = transparent (unchanged): non-printable, non-space — does NOT change in_word
27//
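// The critical difference from 2-state is that transparent characters
// (control chars, invalid UTF-8, and NUL in the UTF-8 table) do NOT break words.
// Example: "hello\x00world" is 1 word, not 2: NUL is transparent in the UTF-8
// table and word content in the C-locale table, so it never splits the word.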
31
/// Build the 3-state byte classification table for the C/POSIX locale.
///
/// Each entry is one of:
///   - `0` — word content (starts or continues a word),
///   - `1` — space (ends the current word),
///   - `2` — transparent (leaves the in-word state unchanged).
///
/// Classification:
///   - `\t`, `\n`, `\v`, `\f`, `\r` and space are class 1.
///   - Printable ASCII (0x21-0x7E) is class 0.
///   - NUL (0x00) is also class 0. This is a GNU-compat choice: mbrtowc()
///     returns L'\0' for the null byte and GNU wc treats it as a non-space
///     printable character.
///   - Everything else is class 2, including all bytes >= 0x80, for which
///     mbrtowc() fails in the C locale.
const fn make_byte_class_c() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut byte = 0usize;
    while byte < 256 {
        table[byte] = match byte {
            0x00 | 0x21..=0x7E => 0,
            0x09..=0x0D | 0x20 => 1,
            _ => 2,
        };
        byte += 1;
    }
    table
}

/// C/POSIX-locale classification table, built at compile time by
/// [`make_byte_class_c`] and indexed by byte value.
const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
58
/// Build the 3-state single-byte classification table for the UTF-8 locale.
///
/// Entry values: `0` = word content, `1` = space (word break),
/// `2` = transparent (in-word state unchanged).
///
///   - `\t`, `\n`, `\v`, `\f`, `\r` and space are class 1.
///   - Printable ASCII (0x21-0x7E) is class 0.
///   - Every other byte is class 2. That covers NUL, the remaining ASCII
///     controls, DEL, and all bytes >= 0x80; multi-byte UTF-8 sequences are
///     classified separately by the word-counting state machine.
const fn make_byte_class_utf8() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut byte = 0usize;
    while byte < 256 {
        table[byte] = match byte {
            0x21..=0x7E => 0,
            0x09..=0x0D | 0x20 => 1,
            _ => 2,
        };
        byte += 1;
    }
    table
}

/// UTF-8-locale single-byte classification table, built at compile time by
/// [`make_byte_class_utf8`] and indexed by byte value.
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
80
81// ──────────────────────────────────────────────────
82// Unicode character classification helpers
83// ──────────────────────────────────────────────────
84
/// Report whether `cp` is one of the non-ASCII Unicode space characters that
/// end a word.
///
/// ASCII whitespace is not handled here; the byte classification tables cover
/// it. The set is intended to mirror glibc `iswspace`.
/// NOTE(review): the list includes the no-break spaces U+00A0 and U+202F and
/// the figure space U+2007 (inside the U+2000..=U+200A range) — confirm that
/// this matches the glibc/GNU wc version being targeted.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        // No-Break Space, Ogham Space Mark
        0x00A0 | 0x1680 => true,
        // En Quad through Hair Space
        0x2000..=0x200A => true,
        // Line Separator, Paragraph Separator
        0x2028 | 0x2029 => true,
        // Narrow No-Break Space, Medium Mathematical Space
        0x202F | 0x205F => true,
        // Ideographic Space
        0x3000 => true,
        _ => false,
    }
}
102
/// Approximate glibc `iswprint` for a non-ASCII codepoint.
///
/// Anything above the C1 control block counts as printable: U+0080-U+009F
/// are rejected and every codepoint from U+00A0 upward is accepted. Values
/// below 0x80 also return `false`; callers are expected to classify ASCII
/// through the byte tables instead.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    // 0x9F is the last C1 control character.
    cp > 0x9F
}
110
111// ──────────────────────────────────────────────────
112// Core counting functions
113// ──────────────────────────────────────────────────
114
115/// Count newlines using SIMD-accelerated memchr.
116/// GNU wc counts newline bytes (`\n`), not logical lines.
117#[inline]
118pub fn count_lines(data: &[u8]) -> u64 {
119    memchr_iter(b'\n', data).count() as u64
120}
121
/// Return the length of `data` in bytes.
///
/// Trivial, but kept so that every count has a matching function.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let byte_len: usize = data.len();
    byte_len as u64
}
127
128/// Count words using locale-aware 3-state logic (default: UTF-8).
129pub fn count_words(data: &[u8]) -> u64 {
130    count_words_locale(data, true)
131}
132
133/// Count words with explicit locale control using 3-state logic.
134///
135/// GNU wc classifies each character as:
136///   - space (iswspace=true): sets in_word=false
137///   - printable (iswprint=true): sets in_word=true, increments word count on transition
138///   - transparent (neither): leaves in_word unchanged
139pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
140    if utf8 {
141        count_words_utf8(data)
142    } else {
143        count_words_c(data)
144    }
145}
146
147/// Count words in C/POSIX locale using 3-state scalar logic.
148/// Only printable ASCII (0x21-0x7E) forms words.
149/// Bytes >= 0x80 and non-printable ASCII controls are transparent.
150///
151/// Optimized with ASCII run skipping for printable characters.
152fn count_words_c(data: &[u8]) -> u64 {
153    let mut words = 0u64;
154    let mut in_word = false;
155    let mut i = 0;
156    let len = data.len();
157
158    while i < len {
159        let b = unsafe { *data.get_unchecked(i) };
160        if b >= 0x21 && b <= 0x7E {
161            // Printable ASCII — word content
162            if !in_word {
163                in_word = true;
164                words += 1;
165            }
166            i += 1;
167            // Skip remaining printable ASCII
168            while i < len {
169                let b = unsafe { *data.get_unchecked(i) };
170                if b >= 0x21 && b <= 0x7E {
171                    i += 1;
172                } else {
173                    break;
174                }
175            }
176        } else {
177            let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
178            if class == 1 {
179                in_word = false;
180            } else if class == 0 {
181                // NUL is printable in C locale — starts/continues word
182                if !in_word {
183                    in_word = true;
184                    words += 1;
185                }
186            }
187            // class == 2: transparent — in_word unchanged
188            i += 1;
189        }
190    }
191    words
192}
193
/// AVX2-accelerated fused line+word counter for C locale chunks.
///
/// Processes 32 bytes per iteration and applies the same 3-state rules as the
/// scalar C-locale counter:
///   - word content: NUL and printable ASCII (0x21-0x7E)
///   - space: 0x09-0x0D and 0x20
///   - transparent: everything else (other controls, DEL, bytes >= 0x80);
///     these leave the in-word state unchanged
///
/// The vector loop and the scalar tail use the same classification, so the
/// result does not depend on where a byte falls relative to a 32-byte boundary.
///
/// Returns `(line_count, word_count, first_active_is_printable, ends_in_word)`,
/// where `first_active_is_printable` is true when the first non-transparent
/// byte of `data` is word content.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    const LANES: usize = 32;
    // Each u8 lane of the newline accumulator gains at most 1 per vector, so
    // it must be drained after at most 255 vectors.
    const BATCH_BYTES: usize = LANES * 255;

    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut in_word = false;
    // `None` until the first non-transparent byte; then whether it was word content.
    let mut first_active: Option<bool> = None;

    // SAFETY: the caller guarantees AVX2. Every vector load reads exactly
    // `LANES` bytes from a `chunks_exact(LANES)` slice, and `loadu` has no
    // alignment requirement.
    unsafe {
        let zero = _mm256_setzero_si256();
        let ones = _mm256_set1_epi8(1);
        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
        let space_byte = _mm256_set1_epi8(0x20);
        let del_byte = _mm256_set1_epi8(0x7F);
        let below_tab = _mm256_set1_epi8(0x08);
        let above_cr = _mm256_set1_epi8(0x0E);

        for batch in data.chunks(BATCH_BYTES) {
            let vectors = batch.chunks_exact(LANES);
            // Non-empty only for the final batch.
            let tail = vectors.remainder();
            let mut line_acc = zero;

            for block in vectors {
                let v = _mm256_loadu_si256(block.as_ptr() as *const __m256i);

                let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
                line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));

                // Signed `v > 0x20` selects 0x21..=0x7F (bytes >= 0x80 are
                // negative); drop DEL and add NUL to get the word class.
                let printable = _mm256_or_si256(
                    _mm256_andnot_si256(
                        _mm256_cmpeq_epi8(v, del_byte),
                        _mm256_cmpgt_epi8(v, space_byte),
                    ),
                    _mm256_cmpeq_epi8(v, zero),
                );
                // 0x09..=0x0D (signed compares exclude bytes >= 0x80) or 0x20.
                let space = _mm256_or_si256(
                    _mm256_and_si256(
                        _mm256_cmpgt_epi8(v, below_tab),
                        _mm256_cmpgt_epi8(above_cr, v),
                    ),
                    _mm256_cmpeq_epi8(v, space_byte),
                );

                // Go through u32 so the mask is zero-extended, not sign-extended.
                let word_bits = _mm256_movemask_epi8(printable) as u32 as u64;
                let space_bits = _mm256_movemask_epi8(space) as u32 as u64;
                let active = word_bits | space_bits;

                if first_active.is_none() && active != 0 {
                    first_active = Some((word_bits >> active.trailing_zeros()) & 1 == 1);
                }

                // State recurrence: after[k] = word[k] | (transparent[k] & before[k]).
                // That is exactly binary carry propagation with generate = word
                // and propagate = transparent, so one addition resolves all 32
                // positions: bit k of `carries` is the state before byte k and
                // bit 32 is the state after the vector.
                let keep = word_bits | (!active & 0xFFFF_FFFF);
                let carries = (keep + word_bits + in_word as u64) ^ keep ^ word_bits;
                total_words += u64::from((word_bits & !carries).count_ones());
                in_word = (carries >> 32) & 1 == 1;
            }

            // Horizontal sum of the per-lane newline counters.
            let sad = _mm256_sad_epu8(line_acc, zero);
            let halves = _mm_add_epi64(
                _mm256_castsi256_si128(sad),
                _mm256_extracti128_si256(sad, 1),
            );
            let sum = _mm_add_epi64(halves, _mm_unpackhi_epi64(halves, halves));
            total_lines += _mm_cvtsi128_si64(sum) as u64;

            // Scalar tail: same three classes as the vector loop.
            for &b in tail {
                match b {
                    0x00 | 0x21..=0x7E => {
                        first_active = first_active.or(Some(true));
                        if !in_word {
                            in_word = true;
                            total_words += 1;
                        }
                    }
                    0x09..=0x0D | 0x20 => {
                        if b == b'\n' {
                            total_lines += 1;
                        }
                        first_active = first_active.or(Some(false));
                        in_word = false;
                    }
                    _ => {}
                }
            }
        }
    }

    (total_lines, total_words, first_active.unwrap_or(false), in_word)
}
277
/// SSE2-accelerated fused line+word counter for C locale chunks.
///
/// Processes 16 bytes per iteration and applies the same 3-state rules as the
/// scalar C-locale counter:
///   - word content: NUL and printable ASCII (0x21-0x7E)
///   - space: 0x09-0x0D and 0x20
///   - transparent: everything else (other controls, DEL, bytes >= 0x80);
///     these leave the in-word state unchanged
///
/// The vector loop and the scalar tail use the same classification, so the
/// result does not depend on where a byte falls relative to a 16-byte boundary.
/// SSE2 is part of the x86_64 baseline, so this path is usable on every
/// x86_64 CPU.
///
/// Returns `(line_count, word_count, first_active_is_printable, ends_in_word)`,
/// where `first_active_is_printable` is true when the first non-transparent
/// byte of `data` is word content.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSE2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    const LANES: usize = 16;
    // Each u8 lane of the newline accumulator gains at most 1 per vector, so
    // it must be drained after at most 255 vectors.
    const BATCH_BYTES: usize = LANES * 255;

    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut in_word = false;
    // `None` until the first non-transparent byte; then whether it was word content.
    let mut first_active: Option<bool> = None;

    // SAFETY: the caller guarantees SSE2. Every vector load reads exactly
    // `LANES` bytes from a `chunks_exact(LANES)` slice, and `loadu` has no
    // alignment requirement.
    unsafe {
        let zero = _mm_setzero_si128();
        let ones = _mm_set1_epi8(1);
        let nl_byte = _mm_set1_epi8(b'\n' as i8);
        let space_byte = _mm_set1_epi8(0x20);
        let del_byte = _mm_set1_epi8(0x7F);
        let below_tab = _mm_set1_epi8(0x08);
        let above_cr = _mm_set1_epi8(0x0E);

        for batch in data.chunks(BATCH_BYTES) {
            let vectors = batch.chunks_exact(LANES);
            // Non-empty only for the final batch.
            let tail = vectors.remainder();
            let mut line_acc = zero;

            for block in vectors {
                let v = _mm_loadu_si128(block.as_ptr() as *const __m128i);

                let is_nl = _mm_cmpeq_epi8(v, nl_byte);
                line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));

                // Signed `v > 0x20` selects 0x21..=0x7F (bytes >= 0x80 are
                // negative); drop DEL and add NUL to get the word class.
                let printable = _mm_or_si128(
                    _mm_andnot_si128(_mm_cmpeq_epi8(v, del_byte), _mm_cmpgt_epi8(v, space_byte)),
                    _mm_cmpeq_epi8(v, zero),
                );
                // 0x09..=0x0D (signed compares exclude bytes >= 0x80) or 0x20.
                let space = _mm_or_si128(
                    _mm_and_si128(_mm_cmpgt_epi8(v, below_tab), _mm_cmpgt_epi8(above_cr, v)),
                    _mm_cmpeq_epi8(v, space_byte),
                );

                // Only the low 16 bits of each mask can be set.
                let word_bits = _mm_movemask_epi8(printable) as u32;
                let space_bits = _mm_movemask_epi8(space) as u32;
                let active = word_bits | space_bits;

                if first_active.is_none() && active != 0 {
                    first_active = Some((word_bits >> active.trailing_zeros()) & 1 == 1);
                }

                // State recurrence: after[k] = word[k] | (transparent[k] & before[k]).
                // That is exactly binary carry propagation with generate = word
                // and propagate = transparent, so one addition resolves all 16
                // positions: bit k of `carries` is the state before byte k and
                // bit 16 is the state after the vector.
                let keep = word_bits | (!active & 0xFFFF);
                let carries = (keep + word_bits + in_word as u32) ^ keep ^ word_bits;
                total_words += u64::from((word_bits & !carries).count_ones());
                in_word = (carries >> 16) & 1 == 1;
            }

            // Horizontal sum of the per-lane newline counters.
            let sad = _mm_sad_epu8(line_acc, zero);
            let sum = _mm_add_epi64(sad, _mm_unpackhi_epi64(sad, sad));
            total_lines += _mm_cvtsi128_si64(sum) as u64;

            // Scalar tail: same three classes as the vector loop.
            for &b in tail {
                match b {
                    0x00 | 0x21..=0x7E => {
                        first_active = first_active.or(Some(true));
                        if !in_word {
                            in_word = true;
                            total_words += 1;
                        }
                    }
                    0x09..=0x0D | 0x20 => {
                        if b == b'\n' {
                            total_lines += 1;
                        }
                        first_active = first_active.or(Some(false));
                        in_word = false;
                    }
                    _ => {}
                }
            }
        }
    }

    (total_lines, total_words, first_active.unwrap_or(false), in_word)
}
353
354/// Dispatch to AVX2, SSE2, or scalar chunk counter.
355#[inline]
356fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
357    #[cfg(target_arch = "x86_64")]
358    {
359        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
360            return unsafe { count_lw_c_chunk_avx2(data) };
361        }
362        if data.len() >= 32 {
363            return unsafe { count_lw_c_chunk_sse2(data) };
364        }
365    }
366    count_lw_c_chunk(data)
367}
368
369/// Count words + lines in a C locale chunk, returning counts plus boundary info.
370/// Used by parallel word counting.
371/// Returns (line_count, word_count, first_active_is_printable, ends_in_word).
372fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
373    let mut lines = 0u64;
374    let mut words = 0u64;
375    let mut in_word = false;
376    let mut first_active_is_printable = false;
377    let mut seen_active = false;
378    let mut i = 0;
379    let len = data.len();
380
381    while i < len {
382        let b = unsafe { *data.get_unchecked(i) };
383        if b >= 0x21 && b <= 0x7E {
384            // Printable ASCII
385            if !seen_active {
386                seen_active = true;
387                first_active_is_printable = true;
388            }
389            if !in_word {
390                in_word = true;
391                words += 1;
392            }
393            i += 1;
394            // Skip remaining printable ASCII
395            while i < len {
396                let b = unsafe { *data.get_unchecked(i) };
397                if b >= 0x21 && b <= 0x7E {
398                    i += 1;
399                } else {
400                    break;
401                }
402            }
403        } else if b == b'\n' {
404            lines += 1;
405            if !seen_active {
406                seen_active = true;
407            }
408            in_word = false;
409            i += 1;
410        } else {
411            let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
412            if class == 1 {
413                if !seen_active {
414                    seen_active = true;
415                }
416                in_word = false;
417            } else if class == 0 {
418                // NUL is printable in C locale — starts/continues word
419                if !seen_active {
420                    seen_active = true;
421                    first_active_is_printable = true;
422                }
423                if !in_word {
424                    in_word = true;
425                    words += 1;
426                }
427            }
428            i += 1;
429        }
430    }
431    (lines, words, first_active_is_printable, in_word)
432}
433
434/// Count words in UTF-8 locale using a state machine with 3-state logic.
435///
436/// Handles:
437/// - ASCII spaces (0x09-0x0D, 0x20): word break
438/// - ASCII printable (0x21-0x7E): word content
439/// - ASCII non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
440/// - Valid UTF-8 multi-byte → check Unicode space/printable
441/// - Invalid UTF-8: transparent (GNU wc skips invalid bytes without changing state)
442///
443/// Optimized with ASCII run skipping: when a word starts, skips remaining
444/// printable ASCII bytes without per-byte table lookups (~4x fewer state checks
445/// for English text with 5-char average word length).
446fn count_words_utf8(data: &[u8]) -> u64 {
447    let mut words = 0u64;
448    let mut in_word = false;
449    let mut i = 0;
450    let len = data.len();
451
452    while i < len {
453        let b = unsafe { *data.get_unchecked(i) };
454
455        if b >= 0x21 && b <= 0x7E {
456            // Printable ASCII (most common case for text) — word content
457            if !in_word {
458                in_word = true;
459                words += 1;
460            }
461            i += 1;
462            // Skip remaining printable ASCII (they don't change state)
463            while i < len {
464                let b = unsafe { *data.get_unchecked(i) };
465                if b >= 0x21 && b <= 0x7E {
466                    i += 1;
467                } else {
468                    break;
469                }
470            }
471        } else if b < 0x80 {
472            // Non-printable ASCII: space/tab/newline/controls
473            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
474            if class == 1 {
475                in_word = false;
476            }
477            // class == 2: transparent (controls 0x00-0x08, 0x0E-0x1F, 0x7F)
478            i += 1;
479        } else if b < 0xC2 {
480            i += 1;
481        } else if b < 0xE0 {
482            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
483                let cp = ((b as u32 & 0x1F) << 6)
484                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
485                if is_unicode_space(cp) {
486                    in_word = false;
487                } else if is_unicode_printable(cp) {
488                    if !in_word {
489                        in_word = true;
490                        words += 1;
491                    }
492                }
493                i += 2;
494            } else {
495                i += 1;
496            }
497        } else if b < 0xF0 {
498            if i + 2 < len
499                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
500                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
501            {
502                let cp = ((b as u32 & 0x0F) << 12)
503                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
504                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
505                if is_unicode_space(cp) {
506                    in_word = false;
507                } else if is_unicode_printable(cp) {
508                    if !in_word {
509                        in_word = true;
510                        words += 1;
511                    }
512                }
513                i += 3;
514            } else {
515                i += 1;
516            }
517        } else if b < 0xF5 {
518            if i + 3 < len
519                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
520                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
521                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
522            {
523                let cp = ((b as u32 & 0x07) << 18)
524                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
525                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
526                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
527                if is_unicode_space(cp) {
528                    in_word = false;
529                } else if is_unicode_printable(cp) {
530                    if !in_word {
531                        in_word = true;
532                        words += 1;
533                    }
534                }
535                i += 4;
536            } else {
537                i += 1;
538            }
539        } else {
540            i += 1;
541        }
542    }
543
544    words
545}
546
547/// Count lines and words using optimized strategies per locale.
548/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
549/// C locale: AVX2 SIMD fused counter when available, scalar fallback otherwise.
550pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
551    if utf8 {
552        count_lines_words_utf8_fused(data)
553    } else {
554        let (lines, words, _, _) = count_lw_c_chunk_fast(data);
555        (lines, words)
556    }
557}
558
559/// Fused lines+words counting in UTF-8 mode (single pass).
560/// Avoids separate memchr pass for newlines by counting them inline with words.
561///
562/// Key optimization: ASCII run skipping. Once a word starts (printable ASCII byte),
563/// we skip remaining printable ASCII bytes without any per-byte state checks.
564/// For English text (avg word ~5 chars), this reduces state transitions by ~4x.
565fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
566    let mut lines = 0u64;
567    let mut words = 0u64;
568    let mut in_word = false;
569    let mut i = 0;
570    let len = data.len();
571
572    while i < len {
573        let b = unsafe { *data.get_unchecked(i) };
574
575        if b >= 0x21 && b <= 0x7E {
576            // Printable ASCII (most common) — word content
577            if !in_word {
578                in_word = true;
579                words += 1;
580            }
581            i += 1;
582            // Skip remaining printable ASCII (they don't change state or count lines)
583            while i < len {
584                let b = unsafe { *data.get_unchecked(i) };
585                if b >= 0x21 && b <= 0x7E {
586                    i += 1;
587                } else {
588                    break;
589                }
590            }
591        } else if b == b'\n' {
592            lines += 1;
593            in_word = false;
594            i += 1;
595        } else if b == b' ' {
596            in_word = false;
597            i += 1;
598        } else if b < 0x80 {
599            // Other ASCII: \t, \r, \v, \f, controls
600            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
601            if class == 1 {
602                in_word = false;
603            }
604            // class == 2: transparent
605            i += 1;
606        } else if b < 0xC2 {
607            i += 1;
608        } else if b < 0xE0 {
609            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
610                let cp = ((b as u32 & 0x1F) << 6)
611                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
612                if is_unicode_space(cp) {
613                    in_word = false;
614                } else if is_unicode_printable(cp) {
615                    if !in_word {
616                        in_word = true;
617                        words += 1;
618                    }
619                }
620                i += 2;
621            } else {
622                i += 1;
623            }
624        } else if b < 0xF0 {
625            if i + 2 < len
626                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
627                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
628            {
629                let cp = ((b as u32 & 0x0F) << 12)
630                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
631                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
632                if is_unicode_space(cp) {
633                    in_word = false;
634                } else if is_unicode_printable(cp) {
635                    if !in_word {
636                        in_word = true;
637                        words += 1;
638                    }
639                }
640                i += 3;
641            } else {
642                i += 1;
643            }
644        } else if b < 0xF5 {
645            if i + 3 < len
646                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
647                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
648                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
649            {
650                let cp = ((b as u32 & 0x07) << 18)
651                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
652                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
653                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
654                if is_unicode_space(cp) {
655                    in_word = false;
656                } else if is_unicode_printable(cp) {
657                    if !in_word {
658                        in_word = true;
659                        words += 1;
660                    }
661                }
662                i += 4;
663            } else {
664                i += 1;
665            }
666        } else {
667            i += 1;
668        }
669    }
670
671    (lines, words)
672}
673
674/// Count lines, words, and chars using optimized strategies per locale.
675pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
676    if utf8 {
677        // Fused single-pass for lines+words, then fast char-counting pass
678        let (lines, words) = count_lines_words_utf8_fused(data);
679        let chars = count_chars_utf8(data);
680        (lines, words, chars)
681    } else {
682        // C locale: use optimized fused lines+words, chars = byte count
683        let (lines, words) = count_lines_words(data, false);
684        (lines, words, data.len() as u64)
685    }
686}
687
688/// Count UTF-8 characters by counting non-continuation bytes.
689/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
690/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
691///
692/// Uses AVX2 SIMD on x86_64 for ~32 bytes per cycle throughput.
693/// Falls back to 64-byte block processing with popcount on other architectures.
694pub fn count_chars_utf8(data: &[u8]) -> u64 {
695    #[cfg(target_arch = "x86_64")]
696    {
697        if is_x86_feature_detected!("avx2") {
698            return unsafe { count_chars_utf8_avx2(data) };
699        }
700    }
701    count_chars_utf8_scalar(data)
702}
703
/// AVX2 character counter: counts the bytes of `data` that are not UTF-8
/// continuation bytes (`10xxxxxx`).
///
/// Each 32-byte vector adds 1 to a u8 lane for every non-continuation byte.
/// The lanes are drained with a PSADBW-based horizontal sum after at most 255
/// vectors, which is the most a u8 lane can hold. Bytes left over after the
/// last full vector are counted one at a time.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    use std::arch::x86_64::*;

    /// Add up all 32 unsigned byte lanes of `lanes`.
    #[inline]
    #[target_feature(enable = "avx2")]
    unsafe fn sum_u8_lanes(lanes: std::arch::x86_64::__m256i) -> u64 {
        use std::arch::x86_64::*;
        // SAFETY: the enclosing function requires AVX2; only register
        // arithmetic happens here.
        unsafe {
            // PSADBW against zero sums each group of 8 bytes into a u64 lane.
            let sad = _mm256_sad_epu8(lanes, _mm256_setzero_si256());
            let halves = _mm_add_epi64(
                _mm256_castsi256_si128(sad),
                _mm256_extracti128_si256(sad, 1),
            );
            let folded = _mm_add_epi64(halves, _mm_unpackhi_epi64(halves, halves));
            _mm_cvtsi128_si64(folded) as u64
        }
    }

    const LANES: usize = 32;
    // 255 vectors per batch keeps every u8 lane counter within range.
    const BATCH_BYTES: usize = LANES * 255;

    let mut total = 0u64;

    // SAFETY: the caller guarantees AVX2. Every vector load reads exactly
    // `LANES` bytes from a `chunks_exact(LANES)` slice, and `loadu` has no
    // alignment requirement.
    unsafe {
        let top_two_bits = _mm256_set1_epi8(0xC0u8 as i8);
        let cont_tag = _mm256_set1_epi8(0x80u8 as i8);
        let one = _mm256_set1_epi8(1);

        for batch in data.chunks(BATCH_BYTES) {
            let vectors = batch.chunks_exact(LANES);
            // Non-empty only for the final batch.
            let leftover = vectors.remainder();
            let mut acc = _mm256_setzero_si256();

            for block in vectors {
                let v = _mm256_loadu_si256(block.as_ptr() as *const __m256i);
                let is_cont = _mm256_cmpeq_epi8(_mm256_and_si256(v, top_two_bits), cont_tag);
                // (!is_cont) & 1: one per non-continuation byte.
                acc = _mm256_add_epi8(acc, _mm256_andnot_si256(is_cont, one));
            }

            total += sum_u8_lanes(acc);
            total += leftover.iter().filter(|&&b| (b & 0xC0) != 0x80).count() as u64;
        }
    }

    total
}
767
/// Scalar fallback for `count_chars_utf8`: counts bytes that are not UTF-8
/// continuation bytes (`10xxxxxx`).
///
/// The input is processed in 64-byte blocks. A block whose bytes are all
/// ASCII contributes 64 directly; any other block builds a 64-bit mask with
/// one bit per character-starting byte and adds its popcount. The bytes after
/// the last full block are counted one at a time.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    let starts_char = |b: u8| (b & 0xC0) != 0x80;

    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();
    let mut total = 0u64;

    for block in blocks {
        // OR of all bytes: the top bit is set iff some byte is non-ASCII.
        let merged = block.iter().fold(0u8, |acc, &b| acc | b);
        if merged < 0x80 {
            total += 64;
            continue;
        }

        let start_mask = block
            .iter()
            .enumerate()
            .fold(0u64, |mask, (bit, &b)| mask | (u64::from(starts_char(b)) << bit));
        total += u64::from(start_mask.count_ones());
    }

    total + tail.iter().filter(|&&b| starts_char(b)).count() as u64
}
819
/// Count characters in the C/POSIX locale, where each byte is exactly one
/// character, so the result equals the byte length of `data`.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_len: usize = data.len();
    byte_len as u64
}
825
826/// Count characters, choosing behavior based on locale.
827#[inline]
828pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
829    if utf8 {
830        count_chars_utf8(data)
831    } else {
832        count_chars_c(data)
833    }
834}
835
/// Detect whether the current locale uses UTF-8 encoding.
///
/// The environment variables `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in
/// that order. The first one that is set to a non-empty value decides: the
/// result is `true` when that value contains `utf-8` or `utf8`, compared
/// case-insensitively. A variable that is unset, empty, or not valid Unicode
/// is skipped. If none qualifies, the result is `false`.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
}
848
/// Decode one UTF-8 character from the start of `bytes`.
///
/// Returns `(codepoint, byte_length)` for ASCII and for well-formed
/// multi-byte sequences. On invalid UTF-8 it returns `(bytes[0] as u32, 1)`
/// so the caller can step over exactly one byte. Invalid input is:
///   - a stray continuation byte, or a lead byte of 0xC0, 0xC1 or >= 0xF5;
///   - a sequence that is truncated or has a non-continuation byte inside it;
///   - an overlong 3- or 4-byte encoding;
///   - a UTF-16 surrogate (U+D800-U+DFFF);
///   - a value above U+10FFFF.
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    /// Decode a multi-byte sequence; `None` unless it is well-formed UTF-8.
    #[inline]
    fn multibyte(bytes: &[u8]) -> Option<(u32, usize)> {
        // Payload bits of the continuation byte at `idx`, if there is one.
        let cont = |idx: usize| -> Option<u32> {
            match bytes.get(idx) {
                Some(&b) if b & 0xC0 == 0x80 => Some(u32::from(b & 0x3F)),
                _ => None,
            }
        };
        let lead = u32::from(*bytes.first()?);
        match lead {
            // Leads 0xC2..=0xDF can only encode U+0080..=U+07FF.
            0xC2..=0xDF => Some((((lead & 0x1F) << 6) | cont(1)?, 2)),
            0xE0..=0xEF => {
                let cp = ((lead & 0x0F) << 12) | (cont(1)? << 6) | cont(2)?;
                // Below U+0800 is overlong; U+D800..=U+DFFF are surrogates.
                if cp >= 0x800 && !(0xD800..=0xDFFF).contains(&cp) {
                    Some((cp, 3))
                } else {
                    None
                }
            }
            0xF0..=0xF4 => {
                let cp = ((lead & 0x07) << 18) | (cont(1)? << 12) | (cont(2)? << 6) | cont(3)?;
                // Below U+10000 is overlong; above U+10FFFF is out of range.
                if (0x1_0000..=0x10_FFFF).contains(&cp) {
                    Some((cp, 4))
                } else {
                    None
                }
            }
            _ => None,
        }
    }

    let b0 = bytes[0];
    if b0 < 0x80 {
        return (u32::from(b0), 1);
    }
    multibyte(bytes).unwrap_or((u32::from(b0), 1))
}
892
/// Report whether `cp` is in this file's list of zero-width codepoints
/// (combining marks, format controls, variation selectors, ...).
///
/// The list is intended to mirror the codepoints for which `wcwidth()`
/// returns 0, so that display-width computation lines up with GNU wc.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    // Inclusive `(first, last)` ranges. The table MUST stay sorted and
    // non-overlapping: membership is decided by binary search.
    static ZERO_WIDTH: &[(u32, u32)] = &[
        (0x0300, 0x036F), // Combining Diacritical Marks
        (0x0483, 0x0489), // Cyrillic combining marks
        (0x0591, 0x05BD), // Hebrew combining marks
        (0x05BF, 0x05BF),
        (0x05C1, 0x05C2),
        (0x05C4, 0x05C5),
        (0x05C7, 0x05C7),
        (0x0600, 0x0605), // Arabic number signs
        (0x0610, 0x061A), // Arabic combining marks
        (0x064B, 0x065F), // Arabic combining marks
        (0x0670, 0x0670),
        (0x06D6, 0x06DD),
        (0x06DF, 0x06E4),
        (0x06E7, 0x06E8),
        (0x06EA, 0x06ED),
        (0x070F, 0x070F),
        (0x0711, 0x0711),
        (0x0730, 0x074A),
        (0x07A6, 0x07B0),
        (0x07EB, 0x07F3),
        (0x07FD, 0x07FD),
        (0x0816, 0x0819),
        (0x081B, 0x0823),
        (0x0825, 0x0827),
        (0x0829, 0x082D),
        (0x0859, 0x085B),
        (0x08D3, 0x08E1),
        (0x08E3, 0x0902),
        (0x093A, 0x093A),
        (0x093C, 0x093C),
        (0x0941, 0x0948),
        (0x094D, 0x094D),
        (0x0951, 0x0957),
        (0x0962, 0x0963),
        (0x0981, 0x0981),
        (0x09BC, 0x09BC),
        (0x09C1, 0x09C4),
        (0x09CD, 0x09CD),
        (0x09E2, 0x09E3),
        (0x09FE, 0x09FE),
        (0x0A01, 0x0A02),
        (0x0A3C, 0x0A3C),
        (0x0A41, 0x0A42),
        (0x0A47, 0x0A48),
        (0x0A4B, 0x0A4D),
        (0x0A51, 0x0A51),
        (0x0A70, 0x0A71),
        (0x0A75, 0x0A75),
        (0x0A81, 0x0A82),
        (0x0ABC, 0x0ABC),
        (0x0AC1, 0x0AC5),
        (0x0AC7, 0x0AC8),
        (0x0ACD, 0x0ACD),
        (0x0AE2, 0x0AE3),
        (0x0AFA, 0x0AFF),
        (0x0B01, 0x0B01),
        (0x0B3C, 0x0B3C),
        (0x0B3F, 0x0B3F),
        (0x0B41, 0x0B44),
        (0x0B4D, 0x0B4D),
        (0x0B56, 0x0B56),
        (0x0B62, 0x0B63),
        (0x0B82, 0x0B82),
        (0x0BC0, 0x0BC0),
        (0x0BCD, 0x0BCD),
        (0x0C00, 0x0C00),
        (0x0C04, 0x0C04),
        (0x0C3E, 0x0C40),
        (0x0C46, 0x0C48),
        (0x0C4A, 0x0C4D),
        (0x0C55, 0x0C56),
        (0x0C62, 0x0C63),
        (0x0C81, 0x0C81),
        (0x0CBC, 0x0CBC),
        (0x0CBF, 0x0CBF),
        (0x0CC6, 0x0CC6),
        (0x0CCC, 0x0CCD),
        (0x0CE2, 0x0CE3),
        (0x0D00, 0x0D01),
        (0x0D3B, 0x0D3C),
        (0x0D41, 0x0D44),
        (0x0D4D, 0x0D4D),
        (0x0D62, 0x0D63),
        (0x0DCA, 0x0DCA),
        (0x0DD2, 0x0DD4),
        (0x0DD6, 0x0DD6),
        (0x0E31, 0x0E31),
        (0x0E34, 0x0E3A),
        (0x0E47, 0x0E4E),
        (0x0EB1, 0x0EB1),
        (0x0EB4, 0x0EBC),
        (0x0EC8, 0x0ECD),
        (0x0F18, 0x0F19),
        (0x0F35, 0x0F35),
        (0x0F37, 0x0F37),
        (0x0F39, 0x0F39),
        (0x0F71, 0x0F7E),
        (0x0F80, 0x0F84),
        (0x0F86, 0x0F87),
        (0x0F8D, 0x0F97),
        (0x0F99, 0x0FBC),
        (0x0FC6, 0x0FC6),
        (0x102D, 0x1030),
        (0x1032, 0x1037),
        (0x1039, 0x103A),
        (0x103D, 0x103E),
        (0x1058, 0x1059),
        (0x105E, 0x1060),
        (0x1071, 0x1074),
        (0x1082, 0x1082),
        (0x1085, 0x1086),
        (0x108D, 0x108D),
        (0x109D, 0x109D),
        (0x1160, 0x11FF), // Hangul Jamo medial vowels and final consonants
        (0x135D, 0x135F),
        (0x1712, 0x1714),
        (0x1732, 0x1734),
        (0x1752, 0x1753),
        (0x1772, 0x1773),
        (0x17B4, 0x17B5),
        (0x17B7, 0x17BD),
        (0x17C6, 0x17C6),
        (0x17C9, 0x17D3),
        (0x17DD, 0x17DD),
        (0x180B, 0x180D),
        (0x1885, 0x1886),
        (0x18A9, 0x18A9),
        (0x1920, 0x1922),
        (0x1927, 0x1928),
        (0x1932, 0x1932),
        (0x1939, 0x193B),
        (0x1A17, 0x1A18),
        (0x1A1B, 0x1A1B),
        (0x1A56, 0x1A56),
        (0x1A58, 0x1A5E),
        (0x1A60, 0x1A60),
        (0x1A62, 0x1A62),
        (0x1A65, 0x1A6C),
        (0x1A73, 0x1A7C),
        (0x1A7F, 0x1A7F),
        (0x1AB0, 0x1ABE),
        (0x1B00, 0x1B03),
        (0x1B34, 0x1B34),
        (0x1B36, 0x1B3A),
        (0x1B3C, 0x1B3C),
        (0x1B42, 0x1B42),
        (0x1B6B, 0x1B73),
        (0x1B80, 0x1B81),
        (0x1BA2, 0x1BA5),
        (0x1BA8, 0x1BA9),
        (0x1BAB, 0x1BAD),
        (0x1BE6, 0x1BE6),
        (0x1BE8, 0x1BE9),
        (0x1BED, 0x1BED),
        (0x1BEF, 0x1BF1),
        (0x1C2C, 0x1C33),
        (0x1C36, 0x1C37),
        (0x1CD0, 0x1CD2),
        (0x1CD4, 0x1CE0),
        (0x1CE2, 0x1CE8),
        (0x1CED, 0x1CED),
        (0x1CF4, 0x1CF4),
        (0x1CF8, 0x1CF9),
        (0x1DC0, 0x1DF9),
        (0x1DFB, 0x1DFF),
        (0x200B, 0x200F), // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        (0x202A, 0x202E), // Bidi control chars
        (0x2060, 0x2064), // Word joiner, invisible operators
        (0x2066, 0x206F), // Bidi isolates
        (0x20D0, 0x20F0), // Combining marks for symbols
        (0xFE00, 0xFE0F), // Variation Selectors
        (0xFE20, 0xFE2F), // Combining Half Marks
        (0xFEFF, 0xFEFF), // Zero Width No-Break Space (BOM)
        (0xFFF9, 0xFFFB), // Interlinear annotation anchors
        (0x1D167, 0x1D169),
        (0x1D173, 0x1D182),
        (0x1D185, 0x1D18B),
        (0x1D1AA, 0x1D1AD),
        (0x1D242, 0x1D244),
        (0xE0001, 0xE0001),
        (0xE0020, 0xE007F),
        (0xE0100, 0xE01EF), // Variation Selectors Supplement
    ];

    ZERO_WIDTH
        .binary_search_by(|&(first, last)| {
            if last < cp {
                std::cmp::Ordering::Less
            } else if first > cp {
                std::cmp::Ordering::Greater
            } else {
                std::cmp::Ordering::Equal
            }
        })
        .is_ok()
}
1083
/// Report whether `cp` is in this file's list of double-width codepoints
/// (East Asian Wide/Fullwidth characters and wide symbols/emoji).
///
/// The list is meant to follow glibc `wcwidth()` so that display widths agree
/// with GNU wc.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    // Inclusive `(first, last)` ranges. The table MUST stay sorted and
    // non-overlapping: lookup relies on `partition_point`.
    static WIDE: &[(u32, u32)] = &[
        (0x1100, 0x115F), // Hangul Jamo
        (0x231A, 0x231B), // Watch, Hourglass
        (0x2329, 0x232A), // Angle Brackets
        (0x23E9, 0x23F3), // Various symbols
        (0x23F8, 0x23FA),
        (0x25FD, 0x25FE),
        (0x2614, 0x2615),
        (0x2648, 0x2653),
        (0x267F, 0x267F),
        (0x2693, 0x2693),
        (0x26A1, 0x26A1),
        (0x26AA, 0x26AB),
        (0x26BD, 0x26BE),
        (0x26C4, 0x26C5),
        (0x26CE, 0x26CE),
        (0x26D4, 0x26D4),
        (0x26EA, 0x26EA),
        (0x26F2, 0x26F3),
        (0x26F5, 0x26F5),
        (0x26FA, 0x26FA),
        (0x26FD, 0x26FD),
        (0x2702, 0x2702),
        (0x2705, 0x2705),
        (0x2708, 0x270D),
        (0x270F, 0x270F),
        (0x2712, 0x2712),
        (0x2714, 0x2714),
        (0x2716, 0x2716),
        (0x271D, 0x271D),
        (0x2721, 0x2721),
        (0x2728, 0x2728),
        (0x2733, 0x2734),
        (0x2744, 0x2744),
        (0x2747, 0x2747),
        (0x274C, 0x274C),
        (0x274E, 0x274E),
        (0x2753, 0x2755),
        (0x2757, 0x2757),
        (0x2763, 0x2764),
        (0x2795, 0x2797),
        (0x27A1, 0x27A1),
        (0x27B0, 0x27B0),
        (0x27BF, 0x27BF),
        (0x2934, 0x2935),
        (0x2B05, 0x2B07),
        (0x2B1B, 0x2B1C),
        (0x2B50, 0x2B50),
        (0x2B55, 0x2B55),
        (0x2E80, 0x303E), // CJK Radicals, Kangxi Radicals, Ideographic Description
        (0x3040, 0x33BF), // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        (0x3400, 0x4DBF), // CJK Unified Ideographs Extension A
        (0x4E00, 0xA4CF), // CJK Unified Ideographs, Yi
        (0xA960, 0xA97C), // Hangul Jamo Extended-A
        (0xAC00, 0xD7A3), // Hangul Syllables
        (0xF900, 0xFAFF), // CJK Compatibility Ideographs
        (0xFE10, 0xFE19), // Vertical Forms
        (0xFE30, 0xFE6F), // CJK Compatibility Forms
        (0xFF01, 0xFF60), // Fullwidth Latin, Halfwidth Katakana
        (0xFFE0, 0xFFE6), // Fullwidth Signs
        (0x1F004, 0x1F004),
        (0x1F0CF, 0x1F0CF),
        (0x1F170, 0x1F171),
        (0x1F17E, 0x1F17F),
        (0x1F18E, 0x1F18E),
        (0x1F191, 0x1F19A),
        (0x1F1E0, 0x1F1FF), // Regional Indicators
        (0x1F200, 0x1F202),
        (0x1F210, 0x1F23B),
        (0x1F240, 0x1F248),
        (0x1F250, 0x1F251),
        (0x1F260, 0x1F265),
        (0x1F300, 0x1F64F), // Misc Symbols, Emoticons
        (0x1F680, 0x1F6FF), // Transport Symbols
        (0x1F900, 0x1F9FF), // Supplemental Symbols
        (0x1FA00, 0x1FA6F),
        (0x1FA70, 0x1FAFF),
        (0x20000, 0x2FFFD), // CJK Unified Ideographs Extension B-F
        (0x30000, 0x3FFFD), // CJK Unified Ideographs Extension G
    ];

    // Index of the first range whose upper bound is not below `cp`; `cp` is
    // wide exactly when that range also starts at or before it.
    let candidate = WIDE.partition_point(|&(_, last)| last < cp);
    match WIDE.get(candidate) {
        Some(&(first, _)) => first <= cp,
        None => false,
    }
}
1170
/// Widest display line in `data` under the C/POSIX locale (`wc -L`).
///
/// Column rules, byte by byte:
/// - `\n` and `\f` end the line: its width is recorded and the column resets.
/// - `\r` moves the column back to 0 without ending the line.
/// - `\t` advances the column to the next multiple of 8.
/// - Space and printable ASCII (0x20..=0x7E) advance the column by 1.
/// - Any other byte (control characters, bytes >= 0x80) has width 0.
///
/// A line's width is the furthest column reached on it, so text overwritten
/// after a `\r` still counts. Runs of graphic ASCII (0x21..=0x7E) are
/// measured in one step rather than byte by byte.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut widest: u64 = 0; // widest line completed so far
    let mut peak: u64 = 0; // furthest column reached on the current line
    let mut col: u64 = 0; // current column on the current line

    let mut rest = data;
    while let Some((&byte, tail)) = rest.split_first() {
        if byte.is_ascii_graphic() {
            // Swallow the whole run of graphic bytes that follows.
            let extra = tail.iter().take_while(|b| b.is_ascii_graphic()).count();
            col += 1 + extra as u64;
            peak = peak.max(col);
            rest = &tail[extra..];
            continue;
        }

        match byte {
            b'\n' | 0x0C => {
                widest = widest.max(peak);
                peak = 0;
                col = 0;
            }
            b'\r' => col = 0,
            b'\t' => {
                col = (col + 8) & !7;
                peak = peak.max(col);
            }
            b' ' => {
                col += 1;
                peak = peak.max(col);
            }
            _ => {} // zero-width byte
        }
        rest = tail;
    }

    // The final line may lack a terminator.
    widest.max(peak)
}
1252
1253/// Compute maximum display width of any line (UTF-8 locale).
1254///
1255/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
1256/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
1257///
1258/// Optimized with printable ASCII run counting for common text.
1259pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1260    let mut max_len: u64 = 0;
1261    let mut line_len: u64 = 0;
1262    let mut linepos: u64 = 0;
1263    let mut i = 0;
1264    let len = data.len();
1265
1266    while i < len {
1267        let b = unsafe { *data.get_unchecked(i) };
1268
1269        if b >= 0x21 && b <= 0x7E {
1270            // Printable non-space ASCII (most common) — count run length
1271            i += 1;
1272            let mut run = 1u64;
1273            while i < len {
1274                let b = unsafe { *data.get_unchecked(i) };
1275                if b >= 0x21 && b <= 0x7E {
1276                    run += 1;
1277                    i += 1;
1278                } else {
1279                    break;
1280                }
1281            }
1282            linepos += run;
1283            if linepos > line_len {
1284                line_len = linepos;
1285            }
1286        } else if b < 0x80 {
1287            // Other ASCII: space, tab, newline, controls
1288            match b {
1289                b' ' => {
1290                    linepos += 1;
1291                    if linepos > line_len {
1292                        line_len = linepos;
1293                    }
1294                }
1295                b'\n' => {
1296                    if line_len > max_len {
1297                        max_len = line_len;
1298                    }
1299                    linepos = 0;
1300                    line_len = 0;
1301                }
1302                b'\t' => {
1303                    linepos = (linepos + 8) & !7;
1304                    if linepos > line_len {
1305                        line_len = linepos;
1306                    }
1307                }
1308                b'\r' => {
1309                    linepos = 0;
1310                }
1311                0x0C => {
1312                    if line_len > max_len {
1313                        max_len = line_len;
1314                    }
1315                    linepos = 0;
1316                    line_len = 0;
1317                }
1318                _ => {} // Non-printable: width 0
1319            }
1320            i += 1;
1321        } else {
1322            // Multibyte UTF-8
1323            let (cp, len) = decode_utf8(&data[i..]);
1324
1325            // C1 control characters (0x80..0x9F): non-printable, width 0
1326            if cp <= 0x9F {
1327                // width 0
1328            } else if is_zero_width(cp) {
1329                // Combining marks, zero-width chars: width 0
1330            } else if is_wide_char(cp) {
1331                linepos += 2;
1332                if linepos > line_len {
1333                    line_len = linepos;
1334                }
1335            } else {
1336                // Regular printable Unicode character: width 1
1337                linepos += 1;
1338                if linepos > line_len {
1339                    line_len = linepos;
1340                }
1341            }
1342            i += len;
1343        }
1344    }
1345
1346    // Handle last line
1347    if line_len > max_len {
1348        max_len = line_len;
1349    }
1350
1351    max_len
1352}
1353
1354/// Compute maximum display width, choosing behavior based on locale.
1355#[inline]
1356pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1357    if utf8 {
1358        max_line_length_utf8(data)
1359    } else {
1360        max_line_length_c(data)
1361    }
1362}
1363
1364/// Count all metrics using optimized individual passes.
1365///
1366/// Each metric uses its own optimized algorithm:
1367/// - Lines: SIMD-accelerated memchr
1368/// - Words: 3-state scalar/state-machine (locale-dependent)
1369/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1370/// - Max line length: locale-aware display width tracking
1371///
1372/// Multi-pass is faster than single-pass because each pass has a tight,
1373/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1374/// making subsequent passes nearly free for memory bandwidth.
1375pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1376    if utf8 {
1377        let (lines, words) = count_lines_words_utf8_fused(data);
1378        WcCounts {
1379            lines,
1380            words,
1381            bytes: data.len() as u64,
1382            chars: count_chars_utf8(data),
1383            max_line_length: max_line_length_utf8(data),
1384        }
1385    } else {
1386        WcCounts {
1387            lines: count_lines(data),
1388            words: count_words_locale(data, false),
1389            bytes: data.len() as u64,
1390            chars: data.len() as u64,
1391            max_line_length: max_line_length_c(data),
1392        }
1393    }
1394}
1395
/// Heuristic ASCII probe: inspects up to three 256-byte windows of `data`
/// (start, middle, end) and returns `false` as soon as one of them contains
/// a byte >= 0x80.
///
/// A `true` result only says the sampled windows are ASCII; bytes outside
/// them are never looked at. Empty input yields `true`.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    const WINDOW: usize = 256;

    let total = data.len();
    let span = WINDOW.min(total);

    // Leading window.
    if !data[..span].is_ascii() {
        return false;
    }

    // Middle window, only when the input is long enough that it is not
    // entirely covered by the leading and trailing windows.
    if total > span * 2 {
        let from = (total / 2).saturating_sub(span / 2);
        let to = (from + span).min(total);
        if !data[from..to].is_ascii() {
            return false;
        }
    }

    // Trailing window (skipped when the leading window was the whole input).
    if total > span && !data[total - span..].is_ascii() {
        return false;
    }

    true
}
1454
1455// ──────────────────────────────────────────────────
1456// Parallel counting for large files
1457// ──────────────────────────────────────────────────
1458
1459/// Count newlines in parallel using SIMD memchr + rayon.
1460/// Each thread gets at least 1MB (to amortize rayon scheduling overhead).
1461pub fn count_lines_parallel(data: &[u8]) -> u64 {
1462    if data.len() < PARALLEL_THRESHOLD {
1463        return count_lines(data);
1464    }
1465
1466    let num_threads = rayon::current_num_threads().max(1);
1467    // Ensure chunks are large enough to amortize SIMD setup overhead
1468    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1469
1470    data.par_chunks(chunk_size)
1471        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1472        .sum()
1473}
1474
1475/// Count words in parallel with boundary adjustment.
1476pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1477    if utf8 || data.len() < PARALLEL_THRESHOLD {
1478        // UTF-8: state machine can't be trivially parallelized
1479        // (multi-byte sequences may span chunk boundaries).
1480        return count_words_locale(data, utf8);
1481    }
1482
1483    // C locale: parallel 3-state word counting with boundary adjustment
1484    let num_threads = rayon::current_num_threads().max(1);
1485    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1486
1487    let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1488
1489    // Each chunk returns (lines, word_count, first_active_is_printable, ends_in_word)
1490    let results: Vec<(u64, u64, bool, bool)> = chunks
1491        .par_iter()
1492        .map(|chunk| count_lw_c_chunk(chunk))
1493        .collect();
1494
1495    let mut total = 0u64;
1496    for i in 0..results.len() {
1497        total += results[i].1;
1498        // Boundary adjustment: if previous chunk ended in_word AND
1499        // current chunk's first non-transparent byte is printable,
1500        // the word was split across chunks — subtract the overcount.
1501        if i > 0 && results[i - 1].3 && results[i].2 {
1502            total -= 1;
1503        }
1504    }
1505    total
1506}
1507
1508/// Count UTF-8 characters in parallel.
1509pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1510    if !utf8 {
1511        return data.len() as u64;
1512    }
1513    if data.len() < PARALLEL_THRESHOLD {
1514        return count_chars_utf8(data);
1515    }
1516
1517    let num_threads = rayon::current_num_threads().max(1);
1518    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1519
1520    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1521}
1522
1523/// Count lines + words + bytes in a single fused pass (the default wc mode).
1524/// Avoids separate passes entirely — combines newline counting with word detection.
1525pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1526    let (lines, words) = count_lines_words(data, utf8);
1527    (lines, words, data.len() as u64)
1528}
1529
1530/// Parallel counting of lines + words + bytes only (no chars).
1531/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1532/// C locale: single fused pass per chunk counts BOTH lines and words.
1533/// UTF-8 with pure ASCII data: falls back to parallel C locale path.
1534pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1535    if data.len() < PARALLEL_THRESHOLD {
1536        // Small file: use fused single-pass
1537        return count_lwb(data, utf8);
1538    }
1539
1540    // For UTF-8 locale: check if data is pure ASCII first.
1541    // If so, UTF-8 and C locale produce identical word counts,
1542    // and we can use the parallelizable C locale path.
1543    let effective_utf8 = if utf8 {
1544        // Quick ASCII check: sample first, middle, last 256 bytes
1545        let is_ascii = check_ascii_sample(data);
1546        if is_ascii {
1547            false // Use C locale parallel path
1548        } else {
1549            true // Need sequential UTF-8 path
1550        }
1551    } else {
1552        false
1553    };
1554
1555    let (lines, words) = if effective_utf8 {
1556        // Must be sequential for UTF-8 with non-ASCII data
1557        count_lines_words_utf8_fused(data)
1558    } else {
1559        // C locale: FUSED parallel lines+words counting — single pass per chunk
1560        let num_threads = rayon::current_num_threads().max(1);
1561        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1562
1563        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1564        let results: Vec<(u64, u64, bool, bool)> = chunks
1565            .par_iter()
1566            .map(|chunk| count_lw_c_chunk_fast(chunk))
1567            .collect();
1568
1569        let mut line_total = 0u64;
1570        let mut word_total = 0u64;
1571        for i in 0..results.len() {
1572            line_total += results[i].0;
1573            word_total += results[i].1;
1574            if i > 0 && results[i - 1].3 && results[i].2 {
1575                word_total -= 1;
1576            }
1577        }
1578
1579        (line_total, word_total)
1580    };
1581
1582    (lines, words, data.len() as u64)
1583}
1584
1585/// Combined parallel counting of lines + words + chars.
1586pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1587    if data.len() < PARALLEL_THRESHOLD {
1588        let lines = count_lines(data);
1589        let words = count_words_locale(data, utf8);
1590        let chars = count_chars(data, utf8);
1591        return (lines, words, chars);
1592    }
1593
1594    // Word counting: sequential for UTF-8 (state machine), parallel for C locale
1595    let words = count_words_parallel(data, utf8);
1596
1597    // Lines and chars can always be parallelized safely
1598    let num_threads = rayon::current_num_threads().max(1);
1599    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1600
1601    let lines: u64 = data
1602        .par_chunks(chunk_size)
1603        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1604        .sum();
1605
1606    let chars = if utf8 {
1607        data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1608    } else {
1609        data.len() as u64
1610    };
1611
1612    (lines, words, chars)
1613}