// coreutils_rs/wc/core.rs — core counting routines for wc.
1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Minimum data size to use parallel processing (1MB).
/// Rayon overhead is ~5-10μs per task; at 1MB with memchr SIMD (~10 GB/s),
/// each chunk takes ~100μs, so overhead is < 10%.
/// NOTE(review): not referenced within this chunk — presumably consumed by a
/// parallel driver elsewhere in the crate (rayon is imported above). Confirm.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
8
/// Results from counting a byte slice.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of newline bytes (`\n`) seen.
    pub lines: u64,
    /// Number of words (maximal runs of non-whitespace).
    pub words: u64,
    /// Total number of bytes.
    pub bytes: u64,
    /// Number of characters (equals `bytes` in C locale; decoded character
    /// count in UTF-8 locale).
    pub chars: u64,
    /// Length of the longest line — presumably display-width based (wcwidth);
    /// the computation is not visible in this chunk. NOTE(review): confirm.
    pub max_line_length: u64,
}
18
19// ──────────────────────────────────────────────────
20// Byte classification for word counting
21// ──────────────────────────────────────────────────
22//
23// GNU wc 9.7 uses a simple 2-state model for word counting:
24//   - Space (word-break): whitespace bytes (0x09-0x0D, 0x20)
25//   - Non-space (word content): everything else, including NUL, control chars,
26//     DEL, high bytes (0x80-0xFF), and encoding errors
27//
28// This matches the GNU wc source which uses `!wc_isspace[c]` to determine
29// if a byte is word content. There is no "transparent" state — every byte
30// either breaks a word or is part of a word.
31//
32// In UTF-8 locale with multibyte path:
33//   - ASCII bytes use the same wc_isspace table
34//   - Encoding errors (EILSEQ) are treated as word content
35//   - Valid multibyte chars: iswspace() determines break vs content
36//   - Non-breaking spaces (U+00A0, U+2007, U+202F, U+2060) are also
37//     treated as space (when POSIXLY_CORRECT is not set), matching GNU wc
38
/// Byte-level space table matching GNU wc 9.7 `wc_isspace[]`.
/// `true` = whitespace (word break), `false` = word content.
/// Exactly the six standard C-locale whitespace bytes are marked as spaces.
const fn make_is_space() -> [bool; 256] {
    // HT, LF, VT, FF, CR, SP — the C locale's whitespace set.
    const WHITESPACE: [u8; 6] = [0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x20];
    let mut table = [false; 256];
    let mut k = 0;
    while k < WHITESPACE.len() {
        table[WHITESPACE[k] as usize] = true;
        k += 1;
    }
    table
}
const IS_SPACE: [bool; 256] = make_is_space();
53
54/// For parallel chunk merging: determine if a chunk starts with word content
55/// (i.e., the first byte is not whitespace).
56#[inline]
57pub(crate) fn first_is_word(data: &[u8]) -> bool {
58    !data.is_empty() && !IS_SPACE[data[0] as usize]
59}
60
61// ──────────────────────────────────────────────────
62// Unicode character classification helpers
63// ──────────────────────────────────────────────────
64
/// Check if a Unicode codepoint is a whitespace character (matching glibc iswspace).
/// Only covers multi-byte Unicode spaces; ASCII spaces are handled by the byte table.
///
/// U+0085 (Next Line, NEL) is included: it is in glibc's `space` character
/// class for UTF-8 locales and in Unicode's White_Space property, so glibc
/// `iswspace()` — which GNU wc relies on — reports it as a word break.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    matches!(
        cp,
        0x0085 |           // Next Line (NEL) — glibc iswspace is true for this
        0x1680 |           // Ogham Space Mark
        0x2000
            ..=0x200A |  // En Quad through Hair Space
        0x2028 |           // Line Separator
        0x2029 |           // Paragraph Separator
        0x205F |           // Medium Mathematical Space
        0x3000 // Ideographic Space
    )
}
80
/// Check if a Unicode codepoint is a "non-breaking space" that GNU wc treats
/// as a word separator (when POSIXLY_CORRECT is not set).
/// Matches GNU wc 9.7 iswnbspace(): U+00A0, U+2007, U+202F, U+2060.
#[inline]
fn is_wnbspace(cp: u32) -> bool {
    cp == 0x00A0 // No-Break Space
        || cp == 0x2007 // Figure Space
        || cp == 0x202F // Narrow No-Break Space
        || cp == 0x2060 // Word Joiner
}
88
89/// Check if a Unicode codepoint is any kind of space (iswspace || iswnbspace).
90#[inline]
91fn is_unicode_word_break(cp: u32) -> bool {
92    is_unicode_space(cp) || is_wnbspace(cp)
93}
94
95// ──────────────────────────────────────────────────
96// Core counting functions
97// ──────────────────────────────────────────────────
98
99/// Count newlines using SIMD-accelerated memchr.
100/// GNU wc counts newline bytes (`\n`), not logical lines.
101#[inline]
102pub fn count_lines(data: &[u8]) -> u64 {
103    memchr_iter(b'\n', data).count() as u64
104}
105
/// Report the byte count of `data`. Trivially the slice length; kept so the
/// counting API is uniform across all five wc statistics.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let n = data.len();
    n as u64
}
111
112/// Count words using locale-aware 2-state logic (default: UTF-8).
113pub fn count_words(data: &[u8]) -> u64 {
114    count_words_locale(data, true)
115}
116
117/// Count words with explicit locale control using 2-state logic.
118///
119/// GNU wc classifies each byte/character as:
120///   - space (whitespace): sets in_word=false
121///   - word content (everything else): sets in_word=true, increments word count on transition
122pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
123    if utf8 {
124        count_words_utf8(data)
125    } else {
126        count_words_c(data)
127    }
128}
129
/// Count words in C/POSIX locale using 2-state logic matching GNU wc 9.7.
/// A byte either breaks a word (0x09-0x0D, 0x20) or extends one; NUL bytes,
/// control chars, DEL, and high bytes (0x80-0xFF) are all word content.
fn count_words_c(data: &[u8]) -> u64 {
    let mut total = 0u64;
    let mut inside = false;
    for &byte in data {
        // Same predicate as the IS_SPACE table: exactly 0x09..=0x0D and 0x20.
        if matches!(byte, b'\t'..=b'\r' | b' ') {
            inside = false;
        } else {
            if !inside {
                total += 1;
            }
            inside = true;
        }
    }
    total
}
151
/// Scalar tail for SIMD line+word counters: processes remaining bytes after
/// the SIMD loop and returns final counts with boundary info.
///
/// `ptr`/`len` describe the same buffer as `data`; `i` is the index where the
/// SIMD loop stopped; `total_lines`, `total_words`, and `prev_in_word` carry
/// the running state out of the SIMD loop. Returns
/// `(lines, words, first_is_word_content, ends_in_word)`; the last two let
/// callers merge adjacent chunks without double-counting a straddling word.
/// SAFETY: caller must ensure ptr is valid for [0..len) and i <= len.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn count_lw_c_scalar_tail(
    ptr: *const u8,
    mut i: usize,
    len: usize,
    mut total_lines: u64,
    mut total_words: u64,
    mut prev_in_word: bool,
    data: &[u8],
) -> (u64, u64, bool, bool) {
    while i < len {
        // SAFETY: i < len and the caller guarantees ptr is valid for len bytes.
        let b = unsafe { *ptr.add(i) };
        if IS_SPACE[b as usize] {
            // Newlines are a subset of the space class; count them here.
            if b == b'\n' {
                total_lines += 1;
            }
            prev_in_word = false;
        } else if !prev_in_word {
            total_words += 1;
            prev_in_word = true;
        }
        i += 1;
    }
    // Boundary info for parallel merging: does the whole chunk open with word
    // content, and does it end inside a word?
    let first_word = first_is_word(data);
    (total_lines, total_words, first_word, prev_in_word)
}
182
/// AVX2-accelerated fused line+word counter for C locale chunks.
/// Processes 32 bytes per iteration using 2-state logic matching GNU wc 9.7:
///   - Space: {0x09-0x0D, 0x20} (6 bytes) — ends word
///   - Non-space: everything else — starts/continues word
/// Word transitions detected via bitmask: space-to-nonspace transitions.
///
/// Returns `(lines, words, first_is_word_content, ends_in_word)` — the same
/// tuple shape as the scalar chunk counter, via the shared scalar tail.
///
/// # Safety
/// Caller must ensure AVX2 is available (checked by the dispatcher).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    let len = data.len();
    let ptr = data.as_ptr();
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    // Carried across vectors: was the previous byte word content?
    let mut prev_in_word = false;

    unsafe {
        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
        let zero = _mm256_setzero_si256();
        let ones = _mm256_set1_epi8(1);
        // Space detection: {0x09-0x0D, 0x20}
        let const_0x09 = _mm256_set1_epi8(0x09u8 as i8);
        let const_0x0d = _mm256_set1_epi8(0x0Du8 as i8);
        let const_0x20 = _mm256_set1_epi8(0x20u8 as i8);

        // Per-lane u8 newline accumulator; flushed before lanes can overflow.
        let mut line_acc = _mm256_setzero_si256();
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
            line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));

            // Space check: byte in {0x09-0x0D, 0x20}
            // Unsigned range test: max(v,0x09)==v means v>=0x09; min(v,0x0D)==v
            // means v<=0x0D (AVX2 has no unsigned epi8 compare).
            let ge_09 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x09), v);
            let le_0d = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x0d), v);
            let in_tab_range = _mm256_and_si256(ge_09, le_0d);
            let is_sp = _mm256_cmpeq_epi8(v, const_0x20);
            let is_space = _mm256_or_si256(in_tab_range, is_sp);
            let space_mask = _mm256_movemask_epi8(is_space) as u32;

            // 2-state: non-space = word content, space = break
            // Word starts = positions where byte is non-space AND previous byte was space
            let nonspace_mask = !space_mask;
            // Build "previous was space" mask: shift space_mask left by 1, inject prev state
            // (bit 0 comes from the byte before this vector: 1 if it was a space/start).
            let prev_space = (space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 };
            let starts = nonspace_mask & prev_space;
            total_words += starts.count_ones() as u64;

            // Update prev_in_word: last byte of this chunk is non-space?
            prev_in_word = (nonspace_mask >> 31) & 1 == 1;

            batch += 1;
            if batch >= 255 {
                // Flush: each u8 lane holds at most 255 — horizontal-sum via
                // PSADBW (sum of absolute differences against zero) then fold
                // the four 64-bit partial sums.
                let sad = _mm256_sad_epu8(line_acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let s = _mm_add_epi64(lo, hi);
                let h64 = _mm_unpackhi_epi64(s, s);
                let t = _mm_add_epi64(s, h64);
                total_lines += _mm_cvtsi128_si64(t) as u64;
                line_acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Final flush of any partially accumulated newline counts.
        if batch > 0 {
            let sad = _mm256_sad_epu8(line_acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let s = _mm_add_epi64(lo, hi);
            let h64 = _mm_unpackhi_epi64(s, s);
            let t = _mm_add_epi64(s, h64);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }
    }

    // Handle the final <32 bytes and compute the boundary flags.
    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
}
264
/// SSE2 variant of count_lw_c_chunk_avx2 — processes 16 bytes per iteration.
/// See AVX2 function above for algorithm details.
///
/// # Safety
/// SSE2 is baseline on x86_64, so this is always safe to call there; the
/// `target_feature` attribute still requires an `unsafe` call site.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    let len = data.len();
    let ptr = data.as_ptr();
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    // Carried across vectors: was the previous byte word content?
    let mut prev_in_word = false;

    unsafe {
        let nl_byte = _mm_set1_epi8(b'\n' as i8);
        let zero = _mm_setzero_si128();
        let ones = _mm_set1_epi8(1);
        // Space detection: {0x09-0x0D, 0x20}
        let const_0x09 = _mm_set1_epi8(0x09u8 as i8);
        let const_0x0d = _mm_set1_epi8(0x0Du8 as i8);
        let const_0x20 = _mm_set1_epi8(0x20u8 as i8);

        // Per-lane u8 newline accumulator; flushed before lanes can overflow.
        let mut line_acc = _mm_setzero_si128();
        let mut batch = 0u32;

        while i + 16 <= len {
            let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);
            let is_nl = _mm_cmpeq_epi8(v, nl_byte);
            line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));

            // Space check: byte in {0x09-0x0D, 0x20}
            // Unsigned range test via max/min (no unsigned epi8 compare in SSE2).
            let ge_09 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x09), v);
            let le_0d = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x0d), v);
            let in_tab_range = _mm_and_si128(ge_09, le_0d);
            let is_sp = _mm_cmpeq_epi8(v, const_0x20);
            let is_space = _mm_or_si128(in_tab_range, is_sp);
            // Only the low 16 bits of the movemask are meaningful for 128-bit vectors.
            let space_mask = (_mm_movemask_epi8(is_space) as u32) & 0xFFFF;

            // 2-state word start detection
            let nonspace_mask = !space_mask & 0xFFFF;
            let prev_space = ((space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 }) & 0xFFFF;
            let starts = nonspace_mask & prev_space;
            total_words += starts.count_ones() as u64;

            // Carry the state of the vector's last byte into the next iteration.
            prev_in_word = (nonspace_mask >> 15) & 1 == 1;

            batch += 1;
            if batch >= 255 {
                // Flush u8 lanes via PSADBW before they can overflow.
                let sad = _mm_sad_epu8(line_acc, zero);
                let hi = _mm_unpackhi_epi64(sad, sad);
                let t = _mm_add_epi64(sad, hi);
                total_lines += _mm_cvtsi128_si64(t) as u64;
                line_acc = _mm_setzero_si128();
                batch = 0;
            }
            i += 16;
        }

        // Final flush of any partially accumulated newline counts.
        if batch > 0 {
            let sad = _mm_sad_epu8(line_acc, zero);
            let hi = _mm_unpackhi_epi64(sad, sad);
            let t = _mm_add_epi64(sad, hi);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }
    }

    // Handle the final <16 bytes and compute the boundary flags.
    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
}
334
335/// Dispatch to AVX2, SSE2, or scalar chunk counter.
336#[inline]
337fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
338    #[cfg(target_arch = "x86_64")]
339    {
340        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
341            return unsafe { count_lw_c_chunk_avx2(data) };
342        }
343        if data.len() >= 32 {
344            return unsafe { count_lw_c_chunk_sse2(data) };
345        }
346    }
347    count_lw_c_chunk(data)
348}
349
/// Count words + lines in a C locale chunk using 2-state logic, returning
/// counts plus boundary info for parallel chunk merging.
/// Returns (line_count, word_count, first_is_word_content, ends_in_word).
fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
    // Same predicate as the IS_SPACE table: exactly 0x09..=0x0D and 0x20.
    let is_space = |b: u8| matches!(b, b'\t'..=b'\r' | b' ');
    // Does the chunk open with word content? Needed when merging chunks.
    let starts_with_word = data.first().map_or(false, |&b| !is_space(b));

    let mut lines = 0u64;
    let mut words = 0u64;
    let mut in_word = false;
    for &b in data {
        if is_space(b) {
            // Newlines are part of the space class; tally them here.
            if b == b'\n' {
                lines += 1;
            }
            in_word = false;
        } else if !in_word {
            words += 1;
            in_word = true;
        }
    }
    (lines, words, starts_with_word, in_word)
}
377
/// Count words in UTF-8 locale using 2-state logic matching GNU wc 9.7.
///
/// Returns the number of words, where a word is a maximal run of
/// word-content characters.
///
/// Handles:
/// - ASCII spaces (0x09-0x0D, 0x20): word break
/// - All other ASCII bytes (including NUL, controls, DEL): word content
/// - Valid UTF-8 multi-byte Unicode spaces (iswspace): word break
/// - Non-breaking spaces (U+00A0, U+2007, U+202F, U+2060): word break (iswnbspace)
/// - Valid UTF-8 multi-byte non-space chars: word content
/// - Invalid UTF-8 encoding errors: word content (matches GNU wc EILSEQ handling)
///
/// NOTE(review): the multi-byte branches validate only the continuation-tag
/// bits (0b10xxxxxx), not overlong or surrogate forms. An overlong sequence
/// such as E0 82 A0 therefore decodes to U+00A0 and counts as a word break,
/// while a strict decoder would report EILSEQ (word content). Confirm whether
/// this edge case matters for GNU compatibility.
fn count_words_utf8(data: &[u8]) -> u64 {
    let mut words = 0u64;
    // True while we are inside a run of word-content characters.
    let mut in_word = false;
    let mut i = 0;
    let len = data.len();

    while i < len {
        // SAFETY: i < len is guaranteed by the loop condition.
        let b = unsafe { *data.get_unchecked(i) };

        if b < 0x80 {
            // ASCII byte — 2-state: space or non-space
            if IS_SPACE[b as usize] {
                in_word = false;
            } else if !in_word {
                in_word = true;
                words += 1;
            }
            i += 1;
        } else if b < 0xC2 {
            // Invalid UTF-8: bare continuation byte (0x80-0xBF) or overlong (0xC0-0xC1)
            // Encoding error → word content (matches GNU wc EILSEQ handling)
            if !in_word {
                in_word = true;
                words += 1;
            }
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence: needs one continuation byte.
            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6)
                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 2;
            } else {
                // Incomplete sequence → word content (encoding error)
                if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence: needs two continuation bytes.
            if i + 2 < len
                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 3;
            } else {
                // Incomplete sequence → word content (encoding error)
                if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence: needs three continuation bytes.
            if i + 3 < len
                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 4;
            } else {
                // Incomplete sequence → word content (encoding error)
                if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 1;
            }
        } else {
            // Invalid byte >= 0xF5 → word content (encoding error)
            if !in_word {
                in_word = true;
                words += 1;
            }
            i += 1;
        }
    }

    words
}
492
493/// Count lines and words using optimized strategies per locale.
494/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
495/// C locale: AVX2 SIMD fused counter when available, scalar fallback otherwise.
496pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
497    if utf8 {
498        count_lines_words_utf8_fused(data)
499    } else {
500        let (lines, words, _, _) = count_lw_c_chunk_fast(data);
501        (lines, words)
502    }
503}
504
/// Fused lines+words counting in UTF-8 mode (single pass).
/// Avoids separate memchr pass for newlines by counting them inline with words.
/// Returns `(lines, words)`.
/// Uses 2-state logic matching GNU wc 9.7:
///   - Encoding errors are word content (matching GNU wc EILSEQ handling)
///   - ASCII non-space bytes (including NUL, controls) are word content
///   - Valid multi-byte non-space chars are word content
///
/// NOTE(review): like the words-only UTF-8 counter, the multi-byte branches
/// validate only continuation-tag bits, so overlong/surrogate forms are
/// decoded rather than rejected — confirm this matches GNU behavior.
fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
    let mut lines = 0u64;
    let mut words = 0u64;
    // True while we are inside a run of word-content characters.
    let mut in_word = false;
    let mut i = 0;
    let len = data.len();

    while i < len {
        // SAFETY: i < len is guaranteed by the loop condition.
        let b = unsafe { *data.get_unchecked(i) };

        if b == b'\n' {
            // Newline: counts as a line and breaks any current word.
            lines += 1;
            in_word = false;
            i += 1;
        } else if b < 0x80 {
            // ASCII byte — 2-state: space or non-space
            if IS_SPACE[b as usize] {
                in_word = false;
            } else if !in_word {
                in_word = true;
                words += 1;
            }
            i += 1;
        } else if b < 0xC2 {
            // Invalid UTF-8 → word content
            if !in_word {
                in_word = true;
                words += 1;
            }
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence: needs one continuation byte.
            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6)
                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 2;
            } else {
                // Incomplete sequence → word content (encoding error)
                if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence: needs two continuation bytes.
            if i + 2 < len
                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 3;
            } else {
                // Incomplete sequence → word content (encoding error)
                if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence: needs three continuation bytes.
            if i + 3 < len
                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
                if is_unicode_word_break(cp) {
                    in_word = false;
                } else if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 4;
            } else {
                // Incomplete sequence → word content (encoding error)
                if !in_word {
                    in_word = true;
                    words += 1;
                }
                i += 1;
            }
        } else {
            // Invalid byte >= 0xF5 → word content
            if !in_word {
                in_word = true;
                words += 1;
            }
            i += 1;
        }
    }

    (lines, words)
}
617
618/// Count lines, words, and chars using optimized strategies per locale.
619pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
620    if utf8 {
621        // Fused single-pass for lines+words, then fast char-counting pass
622        let (lines, words) = count_lines_words_utf8_fused(data);
623        let chars = count_chars_utf8(data);
624        (lines, words, chars)
625    } else {
626        // C locale: use optimized fused lines+words, chars = byte count
627        let (lines, words) = count_lines_words(data, false);
628        (lines, words, data.len() as u64)
629    }
630}
631
632/// Count UTF-8 characters by counting non-continuation bytes.
633/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
634/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
635///
636/// Uses AVX2 SIMD on x86_64 for ~32 bytes per cycle throughput.
637/// Falls back to 64-byte block processing with popcount on other architectures.
638pub fn count_chars_utf8(data: &[u8]) -> u64 {
639    #[cfg(target_arch = "x86_64")]
640    {
641        if is_x86_feature_detected!("avx2") {
642            return unsafe { count_chars_utf8_avx2(data) };
643        }
644    }
645    count_chars_utf8_scalar(data)
646}
647
/// AVX2 SIMD character counter: counts non-continuation bytes using
/// vectorized AND+CMP with batched horizontal reduction via PSADBW.
/// Processes 32 bytes per ~3 instructions, with horizontal sum every 255 iterations.
///
/// # Safety
/// Caller must ensure AVX2 is available (checked by the dispatcher).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    unsafe {
        use std::arch::x86_64::*;

        // A byte is a continuation iff (b & 0xC0) == 0x80.
        let mask_c0 = _mm256_set1_epi8(0xC0u8 as i8);
        let val_80 = _mm256_set1_epi8(0x80u8 as i8);
        let ones = _mm256_set1_epi8(1);
        let zero = _mm256_setzero_si256();

        let mut total = 0u64;
        let len = data.len();
        let ptr = data.as_ptr();
        let mut i = 0;
        // Per-lane u8 accumulator; flushed every 255 vectors so lanes never overflow.
        let mut acc = _mm256_setzero_si256();
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            let masked = _mm256_and_si256(v, mask_c0);
            let is_cont = _mm256_cmpeq_epi8(masked, val_80);
            // andnot: 1 for every byte that is NOT a continuation byte.
            let non_cont = _mm256_andnot_si256(is_cont, ones);
            acc = _mm256_add_epi8(acc, non_cont);

            batch += 1;
            if batch >= 255 {
                // Horizontal sum via PSADBW: sum u8 differences against zero
                let sad = _mm256_sad_epu8(acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let sum = _mm_add_epi64(lo, hi);
                let hi64 = _mm_unpackhi_epi64(sum, sum);
                let t = _mm_add_epi64(sum, hi64);
                total += _mm_cvtsi128_si64(t) as u64;
                acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Final horizontal sum
        if batch > 0 {
            let sad = _mm256_sad_epu8(acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let sum = _mm_add_epi64(lo, hi);
            let hi64 = _mm_unpackhi_epi64(sum, sum);
            let t = _mm_add_epi64(sum, hi64);
            total += _mm_cvtsi128_si64(t) as u64;
        }

        // Scalar tail for the final <32 bytes.
        while i < len {
            total += ((*ptr.add(i) & 0xC0) != 0x80) as u64;
            i += 1;
        }

        total
    }
}
711
/// Scalar fallback for count_chars_utf8: counts bytes that are not UTF-8
/// continuation bytes (`10xxxxxx`), processing 64-byte blocks with an
/// all-ASCII fast path.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    let mut total = 0u64;
    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    for block in blocks {
        // Fast path: OR all bytes together; if the high bit never appears,
        // the block is pure ASCII and every byte starts a character.
        let folded = block.iter().fold(0u8, |acc, &b| acc | b);
        if folded < 0x80 {
            total += 64;
        } else {
            // Mixed block: count the non-continuation bytes directly. The
            // fixed 64-byte length lets this loop auto-vectorize.
            total += block.iter().filter(|&&b| (b & 0xC0) != 0x80).count() as u64;
        }
    }

    // Remaining <64 bytes.
    total += tail.iter().filter(|&&b| (b & 0xC0) != 0x80).count() as u64;
    total
}
763
/// Count characters in C/POSIX locale, where every byte is exactly one
/// character, so the answer is simply the slice length.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_len = data.len();
    byte_len as u64
}
769
770/// Count characters, choosing behavior based on locale.
771#[inline]
772pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
773    if utf8 {
774        count_chars_utf8(data)
775    } else {
776        count_chars_c(data)
777    }
778}
779
/// Detect if the current locale uses UTF-8 encoding.
///
/// Inspects the standard locale environment variables in precedence order
/// (LC_ALL, then LC_CTYPE, then LANG). The first variable that is set and
/// non-empty decides; unset or empty variables fall through to the next.
/// With no usable variable, a non-UTF-8 (C/POSIX) locale is assumed.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
}
792
/// Decode one UTF-8 character from a byte slice.
/// Returns (codepoint, byte_length). On invalid UTF-8, returns (byte as u32, 1).
/// Panics if `bytes` is empty (callers must supply at least one byte).
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    // True when bytes[i] exists and carries the continuation tag (0b10xxxxxx).
    let cont = |i: usize| bytes.get(i).map_or(false, |&b| b & 0xC0 == 0x80);
    let b0 = bytes[0];
    let lead = b0 as u32;
    match b0 {
        // ASCII.
        0x00..=0x7F => (lead, 1),
        // Bare continuation byte or overlong 2-byte lead — invalid start.
        0x80..=0xC1 => (lead, 1),
        // Two-byte sequence.
        0xC2..=0xDF if cont(1) => (((lead & 0x1F) << 6) | (bytes[1] as u32 & 0x3F), 2),
        // Three-byte sequence.
        0xE0..=0xEF if cont(1) && cont(2) => (
            ((lead & 0x0F) << 12)
                | ((bytes[1] as u32 & 0x3F) << 6)
                | (bytes[2] as u32 & 0x3F),
            3,
        ),
        // Four-byte sequence (leads above 0xF4 cannot start a valid character).
        0xF0..=0xF4 if cont(1) && cont(2) && cont(3) => (
            ((lead & 0x07) << 18)
                | ((bytes[1] as u32 & 0x3F) << 12)
                | ((bytes[2] as u32 & 0x3F) << 6)
                | (bytes[3] as u32 & 0x3F),
            4,
        ),
        // Truncated sequence, broken continuation, or invalid lead (>= 0xF5).
        _ => (lead, 1),
    }
}
836
/// Check if a Unicode codepoint is a zero-width character (combining mark, etc.).
/// GNU wc uses wcwidth() which returns 0 for these. We must match.
///
/// The table is a hard-coded snapshot of codepoints reported as width 0:
/// format controls, combining/conjunct marks, and the conjoining Hangul
/// jamo range. Entries are ordered by codepoint and grouped by Unicode block.
/// NOTE(review): exact parity depends on the Unicode version the target
/// glibc's wcwidth() was built against — confirm before relying on it.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    matches!(
        cp,
        0x0300..=0x036F   // Combining Diacritical Marks
        | 0x0483..=0x0489 // Cyrillic combining marks
        | 0x0591..=0x05BD // Hebrew combining marks
        | 0x05BF
        | 0x05C1..=0x05C2
        | 0x05C4..=0x05C5
        | 0x05C7
        | 0x0600..=0x0605 // Arabic number signs
        | 0x0610..=0x061A // Arabic combining marks
        | 0x064B..=0x065F // Arabic combining marks
        | 0x0670
        | 0x06D6..=0x06DD
        | 0x06DF..=0x06E4
        | 0x06E7..=0x06E8
        | 0x06EA..=0x06ED
        | 0x070F          // Syriac abbreviation mark
        | 0x0711
        | 0x0730..=0x074A // Syriac marks
        | 0x07A6..=0x07B0 // Thaana
        | 0x07EB..=0x07F3 // NKo
        | 0x07FD
        | 0x0816..=0x0819 // Samaritan
        | 0x081B..=0x0823
        | 0x0825..=0x0827
        | 0x0829..=0x082D
        | 0x0859..=0x085B // Mandaic
        | 0x08D3..=0x08E1
        | 0x08E3..=0x0902 // Arabic Extended-A through Devanagari signs
        | 0x093A
        | 0x093C
        | 0x0941..=0x0948
        | 0x094D
        | 0x0951..=0x0957
        | 0x0962..=0x0963
        | 0x0981          // Bengali
        | 0x09BC
        | 0x09C1..=0x09C4
        | 0x09CD
        | 0x09E2..=0x09E3
        | 0x09FE
        | 0x0A01..=0x0A02 // Gurmukhi
        | 0x0A3C
        | 0x0A41..=0x0A42
        | 0x0A47..=0x0A48
        | 0x0A4B..=0x0A4D
        | 0x0A51
        | 0x0A70..=0x0A71
        | 0x0A75
        | 0x0A81..=0x0A82 // Gujarati
        | 0x0ABC
        | 0x0AC1..=0x0AC5
        | 0x0AC7..=0x0AC8
        | 0x0ACD
        | 0x0AE2..=0x0AE3
        | 0x0AFA..=0x0AFF
        | 0x0B01          // Oriya
        | 0x0B3C
        | 0x0B3F
        | 0x0B41..=0x0B44
        | 0x0B4D
        | 0x0B56
        | 0x0B62..=0x0B63
        | 0x0B82          // Tamil
        | 0x0BC0
        | 0x0BCD
        | 0x0C00          // Telugu
        | 0x0C04
        | 0x0C3E..=0x0C40
        | 0x0C46..=0x0C48
        | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56
        | 0x0C62..=0x0C63
        | 0x0C81          // Kannada
        | 0x0CBC
        | 0x0CBF
        | 0x0CC6
        | 0x0CCC..=0x0CCD
        | 0x0CE2..=0x0CE3
        | 0x0D00..=0x0D01 // Malayalam
        | 0x0D3B..=0x0D3C
        | 0x0D41..=0x0D44
        | 0x0D4D
        | 0x0D62..=0x0D63
        | 0x0DCA          // Sinhala
        | 0x0DD2..=0x0DD4
        | 0x0DD6
        | 0x0E31          // Thai
        | 0x0E34..=0x0E3A
        | 0x0E47..=0x0E4E
        | 0x0EB1          // Lao
        | 0x0EB4..=0x0EBC
        | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19 // Tibetan
        | 0x0F35
        | 0x0F37
        | 0x0F39
        | 0x0F71..=0x0F7E
        | 0x0F80..=0x0F84
        | 0x0F86..=0x0F87
        | 0x0F8D..=0x0F97
        | 0x0F99..=0x0FBC
        | 0x0FC6
        | 0x102D..=0x1030 // Myanmar
        | 0x1032..=0x1037
        | 0x1039..=0x103A
        | 0x103D..=0x103E
        | 0x1058..=0x1059
        | 0x105E..=0x1060
        | 0x1071..=0x1074
        | 0x1082
        | 0x1085..=0x1086
        | 0x108D
        | 0x109D
        | 0x1160..=0x11FF // Hangul Jamo medial vowels and final consonants
        | 0x135D..=0x135F // Ethiopic
        | 0x1712..=0x1714 // Tagalog
        | 0x1732..=0x1734
        | 0x1752..=0x1753
        | 0x1772..=0x1773
        | 0x17B4..=0x17B5 // Khmer
        | 0x17B7..=0x17BD
        | 0x17C6
        | 0x17C9..=0x17D3
        | 0x17DD
        | 0x180B..=0x180D // Mongolian free variation selectors
        | 0x1885..=0x1886
        | 0x18A9
        | 0x1920..=0x1922 // Limbu
        | 0x1927..=0x1928
        | 0x1932
        | 0x1939..=0x193B
        | 0x1A17..=0x1A18 // Buginese
        | 0x1A1B
        | 0x1A56          // Tai Tham
        | 0x1A58..=0x1A5E
        | 0x1A60
        | 0x1A62
        | 0x1A65..=0x1A6C
        | 0x1A73..=0x1A7C
        | 0x1A7F
        | 0x1AB0..=0x1ABE // Combining Diacritical Marks Extended
        | 0x1B00..=0x1B03 // Balinese
        | 0x1B34
        | 0x1B36..=0x1B3A
        | 0x1B3C
        | 0x1B42
        | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81 // Sundanese
        | 0x1BA2..=0x1BA5
        | 0x1BA8..=0x1BA9
        | 0x1BAB..=0x1BAD
        | 0x1BE6          // Batak
        | 0x1BE8..=0x1BE9
        | 0x1BED
        | 0x1BEF..=0x1BF1
        | 0x1C2C..=0x1C33 // Lepcha
        | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2 // Vedic Extensions
        | 0x1CD4..=0x1CE0
        | 0x1CE2..=0x1CE8
        | 0x1CED
        | 0x1CF4
        | 0x1CF8..=0x1CF9
        | 0x1DC0..=0x1DF9 // Combining Diacritical Marks Supplement
        | 0x1DFB..=0x1DFF
        | 0x200B..=0x200F // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        | 0x202A..=0x202E // Bidi control chars
        | 0x2060..=0x2064 // Word joiner, invisible operators
        | 0x2066..=0x206F // Bidi isolates
        | 0x20D0..=0x20F0 // Combining marks for symbols
        | 0xFE00..=0xFE0F // Variation Selectors
        | 0xFE20..=0xFE2F // Combining Half Marks
        | 0xFEFF          // Zero Width No-Break Space (BOM)
        | 0xFFF9..=0xFFFB // Interlinear annotation anchors
        | 0x1D167..=0x1D169 // Musical Symbols (combining)
        | 0x1D173..=0x1D182
        | 0x1D185..=0x1D18B
        | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001           // Language tag (deprecated)
        | 0xE0020..=0xE007F // Tag characters
        | 0xE0100..=0xE01EF // Variation Selectors Supplement
    )
}
1027
/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
/// Matches glibc wcwidth() behavior for maximum GNU compatibility.
///
/// Covers the CJK/Hangul blocks plus the emoji/symbol codepoints that carry
/// East_Asian_Width = Wide. Entries are ordered by codepoint.
/// NOTE(review): broad ranges like 0x1F300..=0x1F64F are treated uniformly
/// as wide; exact wcwidth() parity depends on the Unicode tables of the
/// target glibc version — confirm against it.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F   // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653 // Zodiac signs
        | 0x267F          // Wheelchair symbol
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50          // White medium star
        | 0x2B55          // Heavy large circle
        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3040..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3  // Hangul Syllables
        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19  // Vertical Forms
        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
        | 0xFF01..=0xFF60  // Fullwidth Latin, Halfwidth Katakana
        | 0xFFE0..=0xFFE6  // Fullwidth Signs
        | 0x1F004          // Mahjong tile red dragon
        | 0x1F0CF          // Playing card black joker
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F // Chess Symbols
        | 0x1FA70..=0x1FAFF // Symbols and Pictographs Extended-A
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
1114
/// Compute maximum display width of any line (C/POSIX locale).
///
/// GNU wc -L behavior in C locale:
/// - `\n`: line terminator (records max, resets position)
/// - `\t`: advances to next tab stop (multiple of 8)
/// - `\r`: carriage return (resets position to 0, same line)
/// - `\f`: form feed (acts as line terminator like \n)
/// - Printable ASCII (0x20..0x7E): width 1
/// - Everything else (controls, high bytes): width 0
///
/// Optimized with printable ASCII run counting: for runs of bytes in
/// 0x21-0x7E (no space/tab/newline), counts the entire run length at once.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut best: u64 = 0; // widest line finished so far
    let mut width: u64 = 0; // peak column reached on the current line
    let mut pos: u64 = 0; // current cursor column
    let mut idx = 0usize;

    while idx < data.len() {
        match data[idx] {
            0x21..=0x7E => {
                // Printable non-space ASCII — swallow the whole run at once.
                let start = idx;
                while idx < data.len() && matches!(data[idx], 0x21..=0x7E) {
                    idx += 1;
                }
                pos += (idx - start) as u64;
                width = width.max(pos);
                continue; // idx already points past the run
            }
            b' ' => {
                pos += 1;
                width = width.max(pos);
            }
            b'\t' => {
                // Advance to the next multiple-of-8 tab stop.
                pos = (pos + 8) & !7;
                width = width.max(pos);
            }
            // Both \n and \f terminate the line: fold its width into the max.
            b'\n' | 0x0C => {
                best = best.max(width);
                pos = 0;
                width = 0;
            }
            // \r rewinds the cursor but keeps the line's peak width.
            b'\r' => pos = 0,
            _ => {} // Non-printable: width 0
        }
        idx += 1;
    }

    // Account for a final line without a terminator.
    best.max(width)
}
1196
1197/// Compute maximum display width of any line (UTF-8 locale).
1198///
1199/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
1200/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
1201///
1202/// Optimized with printable ASCII run counting for common text.
1203pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1204    let mut max_len: u64 = 0;
1205    let mut line_len: u64 = 0;
1206    let mut linepos: u64 = 0;
1207    let mut i = 0;
1208    let len = data.len();
1209
1210    while i < len {
1211        let b = unsafe { *data.get_unchecked(i) };
1212
1213        if b >= 0x21 && b <= 0x7E {
1214            // Printable non-space ASCII (most common) — count run length
1215            i += 1;
1216            let mut run = 1u64;
1217            while i < len {
1218                let b = unsafe { *data.get_unchecked(i) };
1219                if b >= 0x21 && b <= 0x7E {
1220                    run += 1;
1221                    i += 1;
1222                } else {
1223                    break;
1224                }
1225            }
1226            linepos += run;
1227            if linepos > line_len {
1228                line_len = linepos;
1229            }
1230        } else if b < 0x80 {
1231            // Other ASCII: space, tab, newline, controls
1232            match b {
1233                b' ' => {
1234                    linepos += 1;
1235                    if linepos > line_len {
1236                        line_len = linepos;
1237                    }
1238                }
1239                b'\n' => {
1240                    if line_len > max_len {
1241                        max_len = line_len;
1242                    }
1243                    linepos = 0;
1244                    line_len = 0;
1245                }
1246                b'\t' => {
1247                    linepos = (linepos + 8) & !7;
1248                    if linepos > line_len {
1249                        line_len = linepos;
1250                    }
1251                }
1252                b'\r' => {
1253                    linepos = 0;
1254                }
1255                0x0C => {
1256                    if line_len > max_len {
1257                        max_len = line_len;
1258                    }
1259                    linepos = 0;
1260                    line_len = 0;
1261                }
1262                _ => {} // Non-printable: width 0
1263            }
1264            i += 1;
1265        } else {
1266            // Multibyte UTF-8
1267            let (cp, blen) = decode_utf8(&data[i..]);
1268
1269            // C1 control characters (0x80..0x9F): non-printable, width 0
1270            if cp <= 0x9F {
1271                // width 0
1272            } else if is_zero_width(cp) {
1273                // Combining marks, zero-width chars: width 0
1274            } else if is_wide_char(cp) {
1275                linepos += 2;
1276                if linepos > line_len {
1277                    line_len = linepos;
1278                }
1279            } else {
1280                // Regular printable Unicode character: width 1
1281                linepos += 1;
1282                if linepos > line_len {
1283                    line_len = linepos;
1284                }
1285            }
1286            i += blen;
1287        }
1288    }
1289
1290    // Handle last line
1291    if line_len > max_len {
1292        max_len = line_len;
1293    }
1294
1295    max_len
1296}
1297
1298/// Compute maximum display width, choosing behavior based on locale.
1299#[inline]
1300pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1301    if utf8 {
1302        max_line_length_utf8(data)
1303    } else {
1304        max_line_length_c(data)
1305    }
1306}
1307
1308/// Count all metrics using optimized individual passes.
1309///
1310/// Each metric uses its own optimized algorithm:
1311/// - Lines: SIMD-accelerated memchr
1312/// - Words: 2-state scalar/state-machine (locale-dependent)
1313/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1314/// - Max line length: locale-aware display width tracking
1315///
1316/// Multi-pass is faster than single-pass because each pass has a tight,
1317/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1318/// making subsequent passes nearly free for memory bandwidth.
1319pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1320    if utf8 {
1321        let (lines, words) = count_lines_words_utf8_fused(data);
1322        WcCounts {
1323            lines,
1324            words,
1325            bytes: data.len() as u64,
1326            chars: count_chars_utf8(data),
1327            max_line_length: max_line_length_utf8(data),
1328        }
1329    } else {
1330        WcCounts {
1331            lines: count_lines(data),
1332            words: count_words_locale(data, false),
1333            bytes: data.len() as u64,
1334            chars: data.len() as u64,
1335            max_line_length: max_line_length_c(data),
1336        }
1337    }
1338}
1339
/// Quick check if data is likely all-ASCII by sampling three regions:
/// the first, middle, and last (up to) 256 bytes.
/// Returns false as soon as any sampled byte is >= 0x80.
/// This is a heuristic — non-ASCII bytes outside the sampled windows
/// are not detected.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    let len = data.len();
    if len == 0 {
        return true;
    }

    let sample = len.min(256);
    // A region is ASCII iff every byte is below 0x80.
    let all_ascii = |start: usize, end: usize| data[start..end].iter().all(|&b| b < 0x80);

    // Beginning of the data.
    if !all_ascii(0, sample) {
        return false;
    }

    // Middle window — only when it cannot overlap the head/tail samples.
    if len > sample * 2 {
        let mid_start = (len / 2).saturating_sub(sample / 2);
        if !all_ascii(mid_start, (mid_start + sample).min(len)) {
            return false;
        }
    }

    // Tail of the data (skipped when the head sample already covered it all).
    len <= sample || all_ascii(len - sample, len)
}
1398
1399// ──────────────────────────────────────────────────
1400// Parallel counting for large files
1401// ──────────────────────────────────────────────────
1402
1403/// Split data into chunks at newline boundaries for parallel processing.
1404/// Returns slices where each slice (except possibly the last) ends with `\n`.
1405/// Splitting at newlines guarantees word boundaries in any locale,
1406/// enabling safe parallel word counting without boundary adjustment.
1407fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1408    if data.is_empty() || num_chunks <= 1 {
1409        return vec![data];
1410    }
1411    let chunk_size = data.len() / num_chunks;
1412    let mut chunks = Vec::with_capacity(num_chunks);
1413    let mut pos = 0;
1414
1415    for _ in 0..num_chunks - 1 {
1416        let target = pos + chunk_size;
1417        if target >= data.len() {
1418            break;
1419        }
1420        let boundary = memchr::memchr(b'\n', &data[target..])
1421            .map(|p| target + p + 1)
1422            .unwrap_or(data.len());
1423        if boundary > pos {
1424            chunks.push(&data[pos..boundary]);
1425        }
1426        pos = boundary;
1427    }
1428    if pos < data.len() {
1429        chunks.push(&data[pos..]);
1430    }
1431    chunks
1432}
1433
1434/// Count newlines in parallel using SIMD memchr + rayon.
1435/// Each thread gets at least 1MB (to amortize rayon scheduling overhead).
1436pub fn count_lines_parallel(data: &[u8]) -> u64 {
1437    if data.len() < PARALLEL_THRESHOLD {
1438        return count_lines(data);
1439    }
1440
1441    let num_threads = rayon::current_num_threads().max(1);
1442    // Ensure chunks are large enough to amortize SIMD setup overhead
1443    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1444
1445    data.par_chunks(chunk_size)
1446        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1447        .sum()
1448}
1449
1450/// Count words in parallel with boundary adjustment.
1451pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1452    if data.len() < PARALLEL_THRESHOLD {
1453        return count_words_locale(data, utf8);
1454    }
1455
1456    let num_threads = rayon::current_num_threads().max(1);
1457
1458    if utf8 {
1459        // UTF-8: split at newline boundaries for safe parallel word counting.
1460        // Newlines are always word boundaries, so no boundary adjustment needed.
1461        let chunks = split_at_newlines(data, num_threads);
1462        chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1463    } else {
1464        // C locale: parallel 2-state word counting with boundary adjustment
1465        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1466
1467        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1468
1469        // Each chunk returns (lines, word_count, first_is_word, ends_in_word)
1470        let results: Vec<(u64, u64, bool, bool)> = chunks
1471            .par_iter()
1472            .map(|chunk| count_lw_c_chunk(chunk))
1473            .collect();
1474
1475        let mut total = 0u64;
1476        for i in 0..results.len() {
1477            total += results[i].1;
1478            // Boundary adjustment: if previous chunk ended in_word AND
1479            // current chunk's first byte is non-space (word content),
1480            // the word was split across chunks — subtract the overcount.
1481            if i > 0 && results[i - 1].3 && results[i].2 {
1482                total -= 1;
1483            }
1484        }
1485        total
1486    }
1487}
1488
1489/// Count UTF-8 characters in parallel.
1490pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1491    if !utf8 {
1492        return data.len() as u64;
1493    }
1494    if data.len() < PARALLEL_THRESHOLD {
1495        return count_chars_utf8(data);
1496    }
1497
1498    let num_threads = rayon::current_num_threads().max(1);
1499    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1500
1501    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1502}
1503
1504/// Count lines + words + bytes in a single fused pass (the default wc mode).
1505/// Avoids separate passes entirely — combines newline counting with word detection.
1506pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1507    let (lines, words) = count_lines_words(data, utf8);
1508    (lines, words, data.len() as u64)
1509}
1510
1511/// Parallel counting of lines + words + bytes only (no chars).
1512/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1513/// C locale: single fused pass per chunk counts BOTH lines and words.
1514/// UTF-8: checks ASCII first for C locale fast path, else splits at newlines
1515/// for safe parallel UTF-8 word counting.
1516pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1517    if data.len() < PARALLEL_THRESHOLD {
1518        // Small file: use fused single-pass
1519        return count_lwb(data, utf8);
1520    }
1521
1522    let num_threads = rayon::current_num_threads().max(1);
1523
1524    let (lines, words) = if !utf8 {
1525        // C locale: FUSED parallel lines+words counting — single pass per chunk
1526        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1527
1528        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1529        let results: Vec<(u64, u64, bool, bool)> = chunks
1530            .par_iter()
1531            .map(|chunk| count_lw_c_chunk_fast(chunk))
1532            .collect();
1533
1534        let mut line_total = 0u64;
1535        let mut word_total = 0u64;
1536        for i in 0..results.len() {
1537            line_total += results[i].0;
1538            word_total += results[i].1;
1539            if i > 0 && results[i - 1].3 && results[i].2 {
1540                word_total -= 1;
1541            }
1542        }
1543
1544        (line_total, word_total)
1545    } else {
1546        // UTF-8 locale: check if ASCII for faster C locale path
1547        let is_ascii = check_ascii_sample(data);
1548        if is_ascii {
1549            // Pure ASCII: use C locale parallel path (arbitrary chunks OK)
1550            let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1551            let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1552            let results: Vec<(u64, u64, bool, bool)> = chunks
1553                .par_iter()
1554                .map(|chunk| count_lw_c_chunk_fast(chunk))
1555                .collect();
1556
1557            let mut line_total = 0u64;
1558            let mut word_total = 0u64;
1559            for i in 0..results.len() {
1560                line_total += results[i].0;
1561                word_total += results[i].1;
1562                if i > 0 && results[i - 1].3 && results[i].2 {
1563                    word_total -= 1;
1564                }
1565            }
1566            (line_total, word_total)
1567        } else {
1568            // Non-ASCII UTF-8: split at newline boundaries for safe parallel
1569            // word counting. Newlines always break words, so no adjustment needed.
1570            let chunks = split_at_newlines(data, num_threads);
1571            let results: Vec<(u64, u64)> = chunks
1572                .par_iter()
1573                .map(|chunk| count_lines_words_utf8_fused(chunk))
1574                .collect();
1575            let mut line_total = 0u64;
1576            let mut word_total = 0u64;
1577            for (l, w) in results {
1578                line_total += l;
1579                word_total += w;
1580            }
1581            (line_total, word_total)
1582        }
1583    };
1584
1585    (lines, words, data.len() as u64)
1586}
1587
1588/// Combined parallel counting of lines + words + chars.
1589/// UTF-8: splits at newline boundaries for fused lines+words+chars per chunk.
1590/// C locale: fused parallel lines+words with boundary adjustment + parallel chars.
1591pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1592    if data.len() < PARALLEL_THRESHOLD {
1593        let lines = count_lines(data);
1594        let words = count_words_locale(data, utf8);
1595        let chars = count_chars(data, utf8);
1596        return (lines, words, chars);
1597    }
1598
1599    let num_threads = rayon::current_num_threads().max(1);
1600
1601    if utf8 {
1602        // UTF-8: fused parallel lines+words+chars per chunk (split at newlines)
1603        let chunks = split_at_newlines(data, num_threads);
1604        let results: Vec<(u64, u64, u64)> = chunks
1605            .par_iter()
1606            .map(|chunk| {
1607                let (lines, words) = count_lines_words_utf8_fused(chunk);
1608                let chars = count_chars_utf8(chunk);
1609                (lines, words, chars)
1610            })
1611            .collect();
1612        let mut lines = 0u64;
1613        let mut words = 0u64;
1614        let mut chars = 0u64;
1615        for (l, w, c) in results {
1616            lines += l;
1617            words += w;
1618            chars += c;
1619        }
1620        (lines, words, chars)
1621    } else {
1622        // C locale: fused parallel lines+words + parallel chars (= byte count)
1623        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1624        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1625        let results: Vec<(u64, u64, bool, bool)> = chunks
1626            .par_iter()
1627            .map(|chunk| count_lw_c_chunk_fast(chunk))
1628            .collect();
1629        let mut lines = 0u64;
1630        let mut words = 0u64;
1631        for i in 0..results.len() {
1632            lines += results[i].0;
1633            words += results[i].1;
1634            if i > 0 && results[i - 1].3 && results[i].2 {
1635                words -= 1;
1636            }
1637        }
1638        (lines, words, data.len() as u64)
1639    }
1640}
1641
1642/// Parallel max line length computation.
1643/// Splits at newline boundaries so each chunk independently computes correct
1644/// max line width (since newlines reset position tracking).
1645pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1646    if data.len() < PARALLEL_THRESHOLD {
1647        return max_line_length(data, utf8);
1648    }
1649    let num_threads = rayon::current_num_threads().max(1);
1650    let chunks = split_at_newlines(data, num_threads);
1651    chunks
1652        .par_iter()
1653        .map(|chunk| {
1654            if utf8 {
1655                max_line_length_utf8(chunk)
1656            } else {
1657                max_line_length_c(chunk)
1658            }
1659        })
1660        .max()
1661        .unwrap_or(0)
1662}
1663
1664/// Parallel counting of all metrics at once.
1665/// Splits at newline boundaries for safe parallel word + max_line_length counting.
1666/// Each chunk computes all metrics in a single traversal group, maximizing cache reuse.
1667pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1668    if data.len() < PARALLEL_THRESHOLD {
1669        return count_all(data, utf8);
1670    }
1671
1672    let num_threads = rayon::current_num_threads().max(1);
1673    let chunks = split_at_newlines(data, num_threads);
1674
1675    if utf8 {
1676        let results: Vec<(u64, u64, u64, u64)> = chunks
1677            .par_iter()
1678            .map(|chunk| {
1679                let (lines, words) = count_lines_words_utf8_fused(chunk);
1680                let chars = count_chars_utf8(chunk);
1681                let max_ll = max_line_length_utf8(chunk);
1682                (lines, words, chars, max_ll)
1683            })
1684            .collect();
1685
1686        let mut counts = WcCounts {
1687            bytes: data.len() as u64,
1688            ..Default::default()
1689        };
1690        for (l, w, c, m) in results {
1691            counts.lines += l;
1692            counts.words += w;
1693            counts.chars += c;
1694            if m > counts.max_line_length {
1695                counts.max_line_length = m;
1696            }
1697        }
1698        counts
1699    } else {
1700        // C locale: fused lines+words per chunk + max_line_length per chunk
1701        let results: Vec<(u64, u64, u64)> = chunks
1702            .par_iter()
1703            .map(|chunk| {
1704                let (lines, words) = count_lines_words(chunk, false);
1705                let max_ll = max_line_length_c(chunk);
1706                (lines, words, max_ll)
1707            })
1708            .collect();
1709
1710        let mut counts = WcCounts {
1711            bytes: data.len() as u64,
1712            chars: data.len() as u64,
1713            ..Default::default()
1714        };
1715        for (l, w, m) in &results {
1716            counts.lines += l;
1717            counts.words += w;
1718            if *m > counts.max_line_length {
1719                counts.max_line_length = *m;
1720            }
1721        }
1722        counts
1723    }
1724}