coreutils_rs/wc/core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3
4/// Minimum data size to use parallel processing (1MB).
5/// Rayon overhead is ~5-10μs per task; at 1MB with memchr SIMD (~10 GB/s),
6/// each chunk takes ~100μs, so overhead is < 10%.
7const PARALLEL_THRESHOLD: usize = 1024 * 1024;
8
9/// Results from counting a byte slice.
10#[derive(Debug, Clone, Default, PartialEq, Eq)]
11pub struct WcCounts {
12    pub lines: u64,
13    pub words: u64,
14    pub bytes: u64,
15    pub chars: u64,
16    pub max_line_length: u64,
17}
18
19// ──────────────────────────────────────────────────
20// 3-state byte classification for word counting
21// ──────────────────────────────────────────────────
22//
23// GNU wc uses mbrtowc() + iswspace() + iswprint() with 3-state logic:
24//   0 = printable (word content): starts or continues a word
25//   1 = space (word break): ends any current word
26//   2 = transparent (unchanged): non-printable, non-space — does NOT change in_word
27//
28// The critical difference from 2-state is that transparent characters
29// (NUL, control chars, invalid UTF-8) do NOT break words.
30// Example: "hello\x00world" is 1 word (NUL is transparent), not 2.
31
32/// Byte classification for C/POSIX locale word counting.
33/// GNU wc in C locale uses a simple 2-state model:
34///   - Space bytes (\\t, \\n, \\v, \\f, \\r, space) break words
35///   - ALL other bytes (including controls, NUL, bytes >= 0x80) form words
36/// This matches GNU behavior where bytes >= 0x80 and control characters
37/// are treated as word content, not transparent.
38const fn make_byte_class_c() -> [u8; 256] {
39    let mut t = [0u8; 256]; // default: word content (printable)
40    // Spaces: break words
41    t[0x09] = 1; // \t
42    t[0x0A] = 1; // \n
43    t[0x0B] = 1; // \v
44    t[0x0C] = 1; // \f
45    t[0x0D] = 1; // \r
46    t[0x20] = 1; // space
47    t
48}
49
50const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
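
// Sanity sketch (not from the original test suite; expectations assumed from the
// doc comment above): the C-locale table is strictly 2-state, so space bytes map
// to 1 and every other byte, including NUL and bytes >= 0x80, maps to 0.
#[test]
fn byte_class_c_is_two_state() {
    assert_eq!(BYTE_CLASS_C[b' ' as usize], 1);
    assert_eq!(BYTE_CLASS_C[b'\t' as usize], 1);
    assert_eq!(BYTE_CLASS_C[0x00], 0); // NUL is word content in the C locale
    assert_eq!(BYTE_CLASS_C[0xFF], 0); // high bytes are word content too
}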
51
52/// 3-state single-byte classification for UTF-8 locale.
53/// Multi-byte UTF-8 sequences are handled by the state machine separately.
54const fn make_byte_class_utf8() -> [u8; 256] {
55    let mut t = [2u8; 256]; // default: transparent
56    // Spaces
57    t[0x09] = 1; // \t
58    t[0x0A] = 1; // \n
59    t[0x0B] = 1; // \v
60    t[0x0C] = 1; // \f
61    t[0x0D] = 1; // \r
62    t[0x20] = 1; // space
63    // Printable ASCII (0x21-0x7E): word content
64    let mut i = 0x21u16;
65    while i <= 0x7E {
66        t[i as usize] = 0;
67        i += 1;
68    }
69    t
70}
71
72const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
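
// Companion sketch for the UTF-8 table: three classes, with the C0 controls and
// DEL left transparent and bytes >= 0x80 deferred to the UTF-8 decoder below.
#[test]
fn byte_class_utf8_is_three_state() {
    assert_eq!(BYTE_CLASS_UTF8[b'a' as usize], 0); // printable ASCII: word content
    assert_eq!(BYTE_CLASS_UTF8[b'\n' as usize], 1); // space: word break
    assert_eq!(BYTE_CLASS_UTF8[0x07], 2); // BEL: transparent
    assert_eq!(BYTE_CLASS_UTF8[0x7F], 2); // DEL: transparent
}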
73
74// ──────────────────────────────────────────────────
75// Unicode character classification helpers
76// ──────────────────────────────────────────────────
77
78/// Check if a Unicode codepoint is a whitespace character (matching glibc iswspace).
79/// Only covers multi-byte Unicode spaces; ASCII spaces are handled by the byte table.
80#[inline]
81fn is_unicode_space(cp: u32) -> bool {
82    matches!(
83        cp,
84        0x00A0 |           // No-Break Space
85        0x1680 |           // Ogham Space Mark
86        0x2000
87            ..=0x200A |  // En Quad through Hair Space
88        0x2028 |           // Line Separator
89        0x2029 |           // Paragraph Separator
90        0x202F |           // Narrow No-Break Space
91        0x205F |           // Medium Mathematical Space
92        0x3000 // Ideographic Space
93    )
94}
95
96/// Check if a Unicode codepoint (>= 0x80) is printable (matching glibc iswprint).
97/// C1 control characters (U+0080-U+009F) are not printable.
98/// Most characters >= U+00A0 are printable.
99#[inline]
100fn is_unicode_printable(cp: u32) -> bool {
101    cp >= 0xA0
102}
103
104// ──────────────────────────────────────────────────
105// Core counting functions
106// ──────────────────────────────────────────────────
107
108/// Count newlines using SIMD-accelerated memchr.
109/// GNU wc counts newline bytes (`\n`), not logical lines.
110#[inline]
111pub fn count_lines(data: &[u8]) -> u64 {
112    memchr_iter(b'\n', data).count() as u64
113}
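
// Quick illustration (a sketch with assumed expectations): newline *bytes* are
// counted, so a final line without a trailing '\n' does not add to the total.
#[test]
fn count_lines_counts_newline_bytes_only() {
    assert_eq!(count_lines(b"a\nb\n"), 2);
    assert_eq!(count_lines(b"a\nb"), 1);
    assert_eq!(count_lines(b""), 0);
}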
114
115/// Count bytes. Trivial but included for API consistency.
116#[inline]
117pub fn count_bytes(data: &[u8]) -> u64 {
118    data.len() as u64
119}
120
121/// Count words assuming a UTF-8 locale (3-state logic); use `count_words_locale` for explicit locale control.
122pub fn count_words(data: &[u8]) -> u64 {
123    count_words_locale(data, true)
124}
125
126/// Count words with explicit locale control using 3-state logic.
127///
128/// GNU wc classifies each character as:
129///   - space (iswspace=true): sets in_word=false
130///   - printable (iswprint=true): sets in_word=true, increments word count on transition
131///   - transparent (neither): leaves in_word unchanged
132pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
133    if utf8 {
134        count_words_utf8(data)
135    } else {
136        count_words_c(data)
137    }
138}
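
// Behavioral sketch of the locale split, mirroring the comments above (expected
// values are assumptions, not taken from the original test suite): invalid bytes
// are transparent in UTF-8 mode but word content in the C locale, and NUL never
// splits a word in either mode.
#[test]
fn count_words_locale_examples() {
    assert_eq!(count_words_locale(b"hello\x00world", true), 1); // NUL is transparent
    assert_eq!(count_words_locale(b"hello\x00world", false), 1); // NUL is word content
    assert_eq!(count_words_locale(b"\xFF\xFF", true), 0); // invalid UTF-8: transparent
    assert_eq!(count_words_locale(b"\xFF\xFF", false), 1); // C locale: word content
}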
139
140/// Count words in C/POSIX locale using simple 2-state logic.
141/// GNU wc in C locale: space bytes break words, ALL other bytes form words.
142/// This includes control characters, NUL, bytes >= 0x80 — they all start/continue words.
143///
144/// Optimized with non-space run skipping.
145fn count_words_c(data: &[u8]) -> u64 {
146    let mut words = 0u64;
147    let mut in_word = false;
148    let mut i = 0;
149    let len = data.len();
150
151    while i < len {
152        let b = unsafe { *data.get_unchecked(i) };
153        let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
154        if class == 1 {
155            // Space — break word
156            in_word = false;
157            i += 1;
158        } else {
159            // Non-space — word content
160            if !in_word {
161                in_word = true;
162                words += 1;
163            }
164            i += 1;
165            // Skip remaining non-space bytes
166            while i < len {
167                let b = unsafe { *data.get_unchecked(i) };
168                let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
169                if class == 0 {
170                    i += 1;
171                } else {
172                    break;
173                }
174            }
175        }
176    }
177    words
178}
179
180/// AVX2-accelerated fused line+word counter for C locale chunks.
181/// Processes 32 bytes per iteration: classifies word characters as any
182/// non-space byte (space = 0x09-0x0D, 0x20), counts word-start transitions
183/// via bitmask, counts newlines via SIMD accumulation with periodic horizontal sum.
184/// GNU wc C locale: ALL non-space bytes are word content.
185#[cfg(target_arch = "x86_64")]
186#[target_feature(enable = "avx2")]
187unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
188    use std::arch::x86_64::*;
189
190    let len = data.len();
191    let ptr = data.as_ptr();
192    let mut i = 0usize;
193    let mut total_lines = 0u64;
194    let mut total_words = 0u64;
195    let mut prev_was_word = false;
196
197    unsafe {
198        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
199        let zero = _mm256_setzero_si256();
200        let ones = _mm256_set1_epi8(1);
201        // Space detection: space bytes are 0x09-0x0D and 0x20
202        let space_char = _mm256_set1_epi8(0x20i8);
203        let tab_lo = _mm256_set1_epi8(0x08i8); // 0x09 - 1
204        let tab_hi = _mm256_set1_epi8(0x0Ei8); // 0x0D + 1
205
206        let mut line_acc = _mm256_setzero_si256();
207        let mut batch = 0u32;
208
209        while i + 32 <= len {
210            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
211            let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
212            line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));
213
214            // is_space = (v == 0x20) | (v > 0x08 && v < 0x0E)
215            let is_sp = _mm256_cmpeq_epi8(v, space_char);
216            let gt_08 = _mm256_cmpgt_epi8(v, tab_lo);
217            let lt_0e = _mm256_cmpgt_epi8(tab_hi, v);
218            let is_tab_range = _mm256_and_si256(gt_08, lt_0e);
219            let is_space = _mm256_or_si256(is_sp, is_tab_range);
220            // is_word = NOT is_space
221            let is_word = _mm256_andnot_si256(is_space, _mm256_set1_epi8(-1));
222
223            let word_mask = _mm256_movemask_epi8(is_word) as u32;
224            let prev_mask = (word_mask << 1) | (prev_was_word as u32);
225            total_words += (word_mask & !prev_mask).count_ones() as u64;
226            prev_was_word = (word_mask >> 31) & 1 == 1;
227
228            batch += 1;
229            if batch >= 255 {
230                let sad = _mm256_sad_epu8(line_acc, zero);
231                let hi = _mm256_extracti128_si256(sad, 1);
232                let lo = _mm256_castsi256_si128(sad);
233                let s = _mm_add_epi64(lo, hi);
234                let h64 = _mm_unpackhi_epi64(s, s);
235                let t = _mm_add_epi64(s, h64);
236                total_lines += _mm_cvtsi128_si64(t) as u64;
237                line_acc = _mm256_setzero_si256();
238                batch = 0;
239            }
240            i += 32;
241        }
242
243        if batch > 0 {
244            let sad = _mm256_sad_epu8(line_acc, zero);
245            let hi = _mm256_extracti128_si256(sad, 1);
246            let lo = _mm256_castsi256_si128(sad);
247            let s = _mm_add_epi64(lo, hi);
248            let h64 = _mm_unpackhi_epi64(s, s);
249            let t = _mm_add_epi64(s, h64);
250            total_lines += _mm_cvtsi128_si64(t) as u64;
251        }
252
253        // Scalar tail using lookup table
254        while i < len {
255            let b = *ptr.add(i);
256            if b == b'\n' {
257                total_lines += 1;
258                prev_was_word = false;
259            } else if *BYTE_CLASS_C.get_unchecked(b as usize) == 1 {
260                // Other space byte
261                prev_was_word = false;
262            } else {
263                // Non-space: word content
264                if !prev_was_word {
265                    total_words += 1;
266                }
267                prev_was_word = true;
268            }
269            i += 1;
270        }
271    }
272
273    let first_is_word = !data.is_empty() && BYTE_CLASS_C[data[0] as usize] == 0;
274    (total_lines, total_words, first_is_word, prev_was_word)
275}
276
277/// SSE2-accelerated fused line+word counter for C locale chunks.
278/// Same algorithm as AVX2 but processes 16 bytes per iteration.
279/// Available on all x86_64 CPUs (SSE2 is baseline for x86_64).
280/// GNU wc C locale: ALL non-space bytes are word content.
281#[cfg(target_arch = "x86_64")]
282#[target_feature(enable = "sse2")]
283unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
284    use std::arch::x86_64::*;
285
286    let len = data.len();
287    let ptr = data.as_ptr();
288    let mut i = 0usize;
289    let mut total_lines = 0u64;
290    let mut total_words = 0u64;
291    let mut prev_was_word = false;
292
293    unsafe {
294        let nl_byte = _mm_set1_epi8(b'\n' as i8);
295        let zero = _mm_setzero_si128();
296        let ones = _mm_set1_epi8(1);
297        // Space detection: space bytes are 0x09-0x0D and 0x20
298        let space_char = _mm_set1_epi8(0x20i8);
299        let tab_lo = _mm_set1_epi8(0x08i8);
300        let tab_hi = _mm_set1_epi8(0x0Ei8);
301
302        let mut line_acc = _mm_setzero_si128();
303        let mut batch = 0u32;
304
305        while i + 16 <= len {
306            let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);
307            let is_nl = _mm_cmpeq_epi8(v, nl_byte);
308            line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));
309
310            // is_space = (v == 0x20) | (v > 0x08 && v < 0x0E)
311            let is_sp = _mm_cmpeq_epi8(v, space_char);
312            let gt_08 = _mm_cmpgt_epi8(v, tab_lo);
313            let lt_0e = _mm_cmpgt_epi8(tab_hi, v);
314            let is_tab_range = _mm_and_si128(gt_08, lt_0e);
315            let is_space = _mm_or_si128(is_sp, is_tab_range);
316            // is_word = NOT is_space
317            let is_word = _mm_andnot_si128(is_space, _mm_set1_epi8(-1));
318
319            let word_mask = _mm_movemask_epi8(is_word) as u32;
320            let prev_mask = (word_mask << 1) | (prev_was_word as u32);
321            total_words += (word_mask & !prev_mask).count_ones() as u64;
322            prev_was_word = (word_mask >> 15) & 1 == 1;
323
324            batch += 1;
325            if batch >= 255 {
326                let sad = _mm_sad_epu8(line_acc, zero);
327                let hi = _mm_unpackhi_epi64(sad, sad);
328                let t = _mm_add_epi64(sad, hi);
329                total_lines += _mm_cvtsi128_si64(t) as u64;
330                line_acc = _mm_setzero_si128();
331                batch = 0;
332            }
333            i += 16;
334        }
335
336        if batch > 0 {
337            let sad = _mm_sad_epu8(line_acc, zero);
338            let hi = _mm_unpackhi_epi64(sad, sad);
339            let t = _mm_add_epi64(sad, hi);
340            total_lines += _mm_cvtsi128_si64(t) as u64;
341        }
342
343        // Scalar tail using lookup table
344        while i < len {
345            let b = *ptr.add(i);
346            if b == b'\n' {
347                total_lines += 1;
348                prev_was_word = false;
349            } else if *BYTE_CLASS_C.get_unchecked(b as usize) == 1 {
350                prev_was_word = false;
351            } else {
352                if !prev_was_word {
353                    total_words += 1;
354                }
355                prev_was_word = true;
356            }
357            i += 1;
358        }
359    }
360
361    let first_is_word = !data.is_empty() && BYTE_CLASS_C[data[0] as usize] == 0;
362    (total_lines, total_words, first_is_word, prev_was_word)
363}
364
365/// Dispatch to AVX2, SSE2, or scalar chunk counter.
366#[inline]
367fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
368    #[cfg(target_arch = "x86_64")]
369    {
370        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
371            return unsafe { count_lw_c_chunk_avx2(data) };
372        }
373        if data.len() >= 32 {
374            return unsafe { count_lw_c_chunk_sse2(data) };
375        }
376    }
377    count_lw_c_chunk(data)
378}
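
// Consistency sketch: the SIMD and scalar chunk counters should agree on the same
// input. This only exercises whichever SIMD paths the host CPU actually supports.
#[test]
fn simd_and_scalar_chunk_counters_agree() {
    let data = b"alpha beta\tgamma\ndelta\x00epsilon\n".repeat(8);
    assert_eq!(count_lw_c_chunk_fast(&data), count_lw_c_chunk(&data));
}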
379
380/// Count words + lines in a C locale chunk, returning counts plus boundary info.
381/// Used by parallel word counting.
382/// Returns (line_count, word_count, first_is_word, ends_in_word).
383/// GNU wc C locale: ALL non-space bytes are word content.
384fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
385    let mut lines = 0u64;
386    let mut words = 0u64;
387    let mut in_word = false;
388    let mut first_is_word = false;
389    let mut seen_first = false;
390    let mut i = 0;
391    let len = data.len();
392
393    while i < len {
394        let b = unsafe { *data.get_unchecked(i) };
395        let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
396        if class == 1 {
397            // Space byte — break word
398            if !seen_first {
399                seen_first = true;
400                // first_is_word stays false (space is not word content)
401            }
402            if b == b'\n' {
403                lines += 1;
404            }
405            in_word = false;
406            i += 1;
407        } else {
408            // Non-space byte — word content
409            if !seen_first {
410                seen_first = true;
411                first_is_word = true;
412            }
413            if !in_word {
414                in_word = true;
415                words += 1;
416            }
417            i += 1;
418            // Skip remaining non-space bytes
419            while i < len {
420                let b = unsafe { *data.get_unchecked(i) };
421                let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
422                if class == 0 {
423                    i += 1;
424                } else {
425                    break;
426                }
427            }
428        }
429    }
430    (lines, words, first_is_word, in_word)
431}
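
// Sketch of how the boundary flags are combined (the same rule the parallel
// counters below apply): if one chunk ends inside a word and the next chunk
// starts with word content, the split word was counted twice.
#[test]
fn chunk_boundary_merge_example() {
    let (l1, w1, _, ends_in_word) = count_lw_c_chunk(b"foo b");
    let (l2, w2, starts_with_word, _) = count_lw_c_chunk(b"ar baz\n");
    let mut words = w1 + w2;
    if ends_in_word && starts_with_word {
        words -= 1; // "b" + "ar" is one word, not two
    }
    assert_eq!((l1 + l2, words), (1, 3)); // totals for "foo bar baz\n"
}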
432
433/// Count words in UTF-8 locale using a state machine with 3-state logic.
434///
435/// Handles:
436/// - ASCII spaces (0x09-0x0D, 0x20): word break
437/// - ASCII printable (0x21-0x7E): word content
438/// - ASCII non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
439/// - Valid UTF-8 multi-byte → check Unicode space/printable
440/// - Invalid UTF-8: transparent (GNU wc skips invalid bytes without changing state)
441///
442/// Optimized with ASCII run skipping: when a word starts, skips remaining
443/// printable ASCII bytes without per-byte table lookups (~4x fewer state checks
444/// for English text with 5-char average word length).
445fn count_words_utf8(data: &[u8]) -> u64 {
446    let mut words = 0u64;
447    let mut in_word = false;
448    let mut i = 0;
449    let len = data.len();
450
451    while i < len {
452        let b = unsafe { *data.get_unchecked(i) };
453
454        if b >= 0x21 && b <= 0x7E {
455            // Printable ASCII (most common case for text) — word content
456            if !in_word {
457                in_word = true;
458                words += 1;
459            }
460            i += 1;
461            // Skip remaining printable ASCII (they don't change state)
462            while i < len {
463                let b = unsafe { *data.get_unchecked(i) };
464                if b >= 0x21 && b <= 0x7E {
465                    i += 1;
466                } else {
467                    break;
468                }
469            }
470        } else if b < 0x80 {
471            // Non-printable ASCII: space/tab/newline/controls
472            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
473            if class == 1 {
474                in_word = false;
475            }
476            // class == 2: transparent (controls 0x00-0x08, 0x0E-0x1F, 0x7F)
477            i += 1;
478        } else if b < 0xC2 {
479            i += 1;
480        } else if b < 0xE0 {
481            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
482                let cp = ((b as u32 & 0x1F) << 6)
483                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
484                if is_unicode_space(cp) {
485                    in_word = false;
486                } else if is_unicode_printable(cp) {
487                    if !in_word {
488                        in_word = true;
489                        words += 1;
490                    }
491                }
492                i += 2;
493            } else {
494                i += 1;
495            }
496        } else if b < 0xF0 {
497            if i + 2 < len
498                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
499                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
500            {
501                let cp = ((b as u32 & 0x0F) << 12)
502                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
503                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
504                if is_unicode_space(cp) {
505                    in_word = false;
506                } else if is_unicode_printable(cp) {
507                    if !in_word {
508                        in_word = true;
509                        words += 1;
510                    }
511                }
512                i += 3;
513            } else {
514                i += 1;
515            }
516        } else if b < 0xF5 {
517            if i + 3 < len
518                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
519                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
520                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
521            {
522                let cp = ((b as u32 & 0x07) << 18)
523                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
524                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
525                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
526                if is_unicode_space(cp) {
527                    in_word = false;
528                } else if is_unicode_printable(cp) {
529                    if !in_word {
530                        in_word = true;
531                        words += 1;
532                    }
533                }
534                i += 4;
535            } else {
536                i += 1;
537            }
538        } else {
539            i += 1;
540        }
541    }
542
543    words
544}
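
// Unicode sketch with assumed expectations: U+3000 (Ideographic Space) separates
// words, while a combining accent is printable and keeps the current word going.
#[test]
fn count_words_utf8_unicode_examples() {
    assert_eq!(count_words_utf8("foo\u{3000}bar".as_bytes()), 2);
    assert_eq!(count_words_utf8("cafe\u{301}".as_bytes()), 1); // "café" with combining acute
}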
545
546/// Count lines and words using optimized strategies per locale.
547/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
548/// C locale: AVX2 or SSE2 fused SIMD counter when available, scalar fallback otherwise.
549pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
550    if utf8 {
551        count_lines_words_utf8_fused(data)
552    } else {
553        let (lines, words, _, _) = count_lw_c_chunk_fast(data);
554        (lines, words)
555    }
556}
557
558/// Fused lines+words counting in UTF-8 mode (single pass).
559/// Avoids separate memchr pass for newlines by counting them inline with words.
560///
561/// Key optimization: ASCII run skipping. Once a word starts (printable ASCII byte),
562/// we skip remaining printable ASCII bytes without any per-byte state checks.
563/// For English text (avg word ~5 chars), this reduces state transitions by ~4x.
564fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
565    let mut lines = 0u64;
566    let mut words = 0u64;
567    let mut in_word = false;
568    let mut i = 0;
569    let len = data.len();
570
571    while i < len {
572        let b = unsafe { *data.get_unchecked(i) };
573
574        if b >= 0x21 && b <= 0x7E {
575            // Printable ASCII (most common) — word content
576            if !in_word {
577                in_word = true;
578                words += 1;
579            }
580            i += 1;
581            // Skip remaining printable ASCII (they don't change state or count lines)
582            while i < len {
583                let b = unsafe { *data.get_unchecked(i) };
584                if b >= 0x21 && b <= 0x7E {
585                    i += 1;
586                } else {
587                    break;
588                }
589            }
590        } else if b == b'\n' {
591            lines += 1;
592            in_word = false;
593            i += 1;
594        } else if b == b' ' {
595            in_word = false;
596            i += 1;
597        } else if b < 0x80 {
598            // Other ASCII: \t, \r, \v, \f, controls
599            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
600            if class == 1 {
601                in_word = false;
602            }
603            // class == 2: transparent
604            i += 1;
605        } else if b < 0xC2 {
606            i += 1;
607        } else if b < 0xE0 {
608            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
609                let cp = ((b as u32 & 0x1F) << 6)
610                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
611                if is_unicode_space(cp) {
612                    in_word = false;
613                } else if is_unicode_printable(cp) {
614                    if !in_word {
615                        in_word = true;
616                        words += 1;
617                    }
618                }
619                i += 2;
620            } else {
621                i += 1;
622            }
623        } else if b < 0xF0 {
624            if i + 2 < len
625                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
626                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
627            {
628                let cp = ((b as u32 & 0x0F) << 12)
629                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
630                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
631                if is_unicode_space(cp) {
632                    in_word = false;
633                } else if is_unicode_printable(cp) {
634                    if !in_word {
635                        in_word = true;
636                        words += 1;
637                    }
638                }
639                i += 3;
640            } else {
641                i += 1;
642            }
643        } else if b < 0xF5 {
644            if i + 3 < len
645                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
646                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
647                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
648            {
649                let cp = ((b as u32 & 0x07) << 18)
650                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
651                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
652                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
653                if is_unicode_space(cp) {
654                    in_word = false;
655                } else if is_unicode_printable(cp) {
656                    if !in_word {
657                        in_word = true;
658                        words += 1;
659                    }
660                }
661                i += 4;
662            } else {
663                i += 1;
664            }
665        } else {
666            i += 1;
667        }
668    }
669
670    (lines, words)
671}
672
673/// Count lines, words, and chars using optimized strategies per locale.
674pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
675    if utf8 {
676        // Fused single-pass for lines+words, then fast char-counting pass
677        let (lines, words) = count_lines_words_utf8_fused(data);
678        let chars = count_chars_utf8(data);
679        (lines, words, chars)
680    } else {
681        // C locale: use optimized fused lines+words, chars = byte count
682        let (lines, words) = count_lines_words(data, false);
683        (lines, words, data.len() as u64)
684    }
685}
686
687/// Count UTF-8 characters by counting non-continuation bytes.
688/// A continuation byte has the bit pattern `10xxxxxx` (0x80 through 0xBF).
689/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
690///
691/// Uses AVX2 SIMD on x86_64, processing 32 bytes per loop iteration.
692/// Falls back to 64-byte block processing with popcount on other architectures.
693pub fn count_chars_utf8(data: &[u8]) -> u64 {
694    #[cfg(target_arch = "x86_64")]
695    {
696        if is_x86_feature_detected!("avx2") {
697            return unsafe { count_chars_utf8_avx2(data) };
698        }
699    }
700    count_chars_utf8_scalar(data)
701}
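
// Char-count sketch: continuation bytes never start a character, so "héllo" is
// five characters in six bytes, while any non-continuation byte (even an invalid
// one like 0xFF) starts a new character.
#[test]
fn count_chars_utf8_examples() {
    assert_eq!(count_chars_utf8("héllo".as_bytes()), 5);
    assert_eq!(count_bytes("héllo".as_bytes()), 6);
    assert_eq!(count_chars_utf8(b"\xFF"), 1);
}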
702
703/// AVX2 SIMD character counter: counts non-continuation bytes using
704/// vectorized AND+CMP with batched horizontal reduction via PSADBW.
705/// Processes 32 bytes per ~3 instructions, with horizontal sum every 255 iterations.
706#[cfg(target_arch = "x86_64")]
707#[target_feature(enable = "avx2")]
708unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
709    unsafe {
710        use std::arch::x86_64::*;
711
712        let mask_c0 = _mm256_set1_epi8(0xC0u8 as i8);
713        let val_80 = _mm256_set1_epi8(0x80u8 as i8);
714        let ones = _mm256_set1_epi8(1);
715        let zero = _mm256_setzero_si256();
716
717        let mut total = 0u64;
718        let len = data.len();
719        let ptr = data.as_ptr();
720        let mut i = 0;
721        let mut acc = _mm256_setzero_si256();
722        let mut batch = 0u32;
723
724        while i + 32 <= len {
725            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
726            let masked = _mm256_and_si256(v, mask_c0);
727            let is_cont = _mm256_cmpeq_epi8(masked, val_80);
728            let non_cont = _mm256_andnot_si256(is_cont, ones);
729            acc = _mm256_add_epi8(acc, non_cont);
730
731            batch += 1;
732            if batch >= 255 {
733                // Horizontal sum via PSADBW: sum u8 differences against zero
734                let sad = _mm256_sad_epu8(acc, zero);
735                let hi = _mm256_extracti128_si256(sad, 1);
736                let lo = _mm256_castsi256_si128(sad);
737                let sum = _mm_add_epi64(lo, hi);
738                let hi64 = _mm_unpackhi_epi64(sum, sum);
739                let t = _mm_add_epi64(sum, hi64);
740                total += _mm_cvtsi128_si64(t) as u64;
741                acc = _mm256_setzero_si256();
742                batch = 0;
743            }
744            i += 32;
745        }
746
747        // Final horizontal sum
748        if batch > 0 {
749            let sad = _mm256_sad_epu8(acc, zero);
750            let hi = _mm256_extracti128_si256(sad, 1);
751            let lo = _mm256_castsi256_si128(sad);
752            let sum = _mm_add_epi64(lo, hi);
753            let hi64 = _mm_unpackhi_epi64(sum, sum);
754            let t = _mm_add_epi64(sum, hi64);
755            total += _mm_cvtsi128_si64(t) as u64;
756        }
757
758        while i < len {
759            total += ((*ptr.add(i) & 0xC0) != 0x80) as u64;
760            i += 1;
761        }
762
763        total
764    }
765}
766
767/// Scalar fallback for count_chars_utf8.
768fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
769    let mut count = 0u64;
770    let chunks = data.chunks_exact(64);
771    let remainder = chunks.remainder();
772
773    for chunk in chunks {
774        // Fast path: if all bytes are ASCII (< 0x80), every byte is a character
775        let mut any_high = 0u8;
776        let mut i = 0;
777        while i + 8 <= 64 {
778            unsafe {
779                any_high |= *chunk.get_unchecked(i);
780                any_high |= *chunk.get_unchecked(i + 1);
781                any_high |= *chunk.get_unchecked(i + 2);
782                any_high |= *chunk.get_unchecked(i + 3);
783                any_high |= *chunk.get_unchecked(i + 4);
784                any_high |= *chunk.get_unchecked(i + 5);
785                any_high |= *chunk.get_unchecked(i + 6);
786                any_high |= *chunk.get_unchecked(i + 7);
787            }
788            i += 8;
789        }
790        if any_high < 0x80 {
791            count += 64;
792            continue;
793        }
794
795        let mut char_mask = 0u64;
796        i = 0;
797        while i + 7 < 64 {
798            unsafe {
799                char_mask |= (((*chunk.get_unchecked(i) & 0xC0) != 0x80) as u64) << i;
800                char_mask |= (((*chunk.get_unchecked(i + 1) & 0xC0) != 0x80) as u64) << (i + 1);
801                char_mask |= (((*chunk.get_unchecked(i + 2) & 0xC0) != 0x80) as u64) << (i + 2);
802                char_mask |= (((*chunk.get_unchecked(i + 3) & 0xC0) != 0x80) as u64) << (i + 3);
803                char_mask |= (((*chunk.get_unchecked(i + 4) & 0xC0) != 0x80) as u64) << (i + 4);
804                char_mask |= (((*chunk.get_unchecked(i + 5) & 0xC0) != 0x80) as u64) << (i + 5);
805                char_mask |= (((*chunk.get_unchecked(i + 6) & 0xC0) != 0x80) as u64) << (i + 6);
806                char_mask |= (((*chunk.get_unchecked(i + 7) & 0xC0) != 0x80) as u64) << (i + 7);
807            }
808            i += 8;
809        }
810        count += char_mask.count_ones() as u64;
811    }
812
813    for &b in remainder {
814        count += ((b & 0xC0) != 0x80) as u64;
815    }
816    count
817}
818
819/// Count characters in C/POSIX locale (each byte is one character).
820#[inline]
821pub fn count_chars_c(data: &[u8]) -> u64 {
822    data.len() as u64
823}
824
825/// Count characters, choosing behavior based on locale.
826#[inline]
827pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
828    if utf8 {
829        count_chars_utf8(data)
830    } else {
831        count_chars_c(data)
832    }
833}
834
835/// Detect if the current locale uses UTF-8 encoding.
836pub fn is_utf8_locale() -> bool {
837    for var in &["LC_ALL", "LC_CTYPE", "LANG"] {
838        if let Ok(val) = std::env::var(var) {
839            if !val.is_empty() {
840                let lower = val.to_ascii_lowercase();
841                return lower.contains("utf-8") || lower.contains("utf8");
842            }
843        }
844    }
845    false
846}
847
848/// Decode one UTF-8 character from a byte slice.
849/// Returns (codepoint, byte_length). On invalid UTF-8, returns (byte as u32, 1).
850#[inline]
851fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
852    let b0 = bytes[0];
853    if b0 < 0x80 {
854        return (b0 as u32, 1);
855    }
856    if b0 < 0xC2 {
857        // Continuation byte or overlong 2-byte — invalid as start
858        return (b0 as u32, 1);
859    }
860    if b0 < 0xE0 {
861        if bytes.len() < 2 || bytes[1] & 0xC0 != 0x80 {
862            return (b0 as u32, 1);
863        }
864        let cp = ((b0 as u32 & 0x1F) << 6) | (bytes[1] as u32 & 0x3F);
865        return (cp, 2);
866    }
867    if b0 < 0xF0 {
868        if bytes.len() < 3 || bytes[1] & 0xC0 != 0x80 || bytes[2] & 0xC0 != 0x80 {
869            return (b0 as u32, 1);
870        }
871        let cp =
872            ((b0 as u32 & 0x0F) << 12) | ((bytes[1] as u32 & 0x3F) << 6) | (bytes[2] as u32 & 0x3F);
873        return (cp, 3);
874    }
875    if b0 < 0xF5 {
876        if bytes.len() < 4
877            || bytes[1] & 0xC0 != 0x80
878            || bytes[2] & 0xC0 != 0x80
879            || bytes[3] & 0xC0 != 0x80
880        {
881            return (b0 as u32, 1);
882        }
883        let cp = ((b0 as u32 & 0x07) << 18)
884            | ((bytes[1] as u32 & 0x3F) << 12)
885            | ((bytes[2] as u32 & 0x3F) << 6)
886            | (bytes[3] as u32 & 0x3F);
887        return (cp, 4);
888    }
889    (b0 as u32, 1)
890}
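
// Decoding sketch: a well-formed three-byte sequence yields its codepoint and
// length, while a stray continuation byte falls back to (byte value, 1).
#[test]
fn decode_utf8_examples() {
    assert_eq!(decode_utf8("€".as_bytes()), (0x20AC, 3));
    assert_eq!(decode_utf8(b"\x80abc"), (0x80, 1));
}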
891
892/// Check if a Unicode codepoint is a zero-width character (combining mark, etc.).
893/// GNU wc uses wcwidth() which returns 0 for these. We must match.
894#[inline]
895fn is_zero_width(cp: u32) -> bool {
896    matches!(
897        cp,
898        0x0300..=0x036F   // Combining Diacritical Marks
899        | 0x0483..=0x0489 // Cyrillic combining marks
900        | 0x0591..=0x05BD // Hebrew combining marks
901        | 0x05BF
902        | 0x05C1..=0x05C2
903        | 0x05C4..=0x05C5
904        | 0x05C7
905        | 0x0600..=0x0605 // Arabic number signs
906        | 0x0610..=0x061A // Arabic combining marks
907        | 0x064B..=0x065F // Arabic combining marks
908        | 0x0670
909        | 0x06D6..=0x06DD
910        | 0x06DF..=0x06E4
911        | 0x06E7..=0x06E8
912        | 0x06EA..=0x06ED
913        | 0x070F
914        | 0x0711
915        | 0x0730..=0x074A
916        | 0x07A6..=0x07B0
917        | 0x07EB..=0x07F3
918        | 0x07FD
919        | 0x0816..=0x0819
920        | 0x081B..=0x0823
921        | 0x0825..=0x0827
922        | 0x0829..=0x082D
923        | 0x0859..=0x085B
924        | 0x08D3..=0x08E1
925        | 0x08E3..=0x0902
926        | 0x093A
927        | 0x093C
928        | 0x0941..=0x0948
929        | 0x094D
930        | 0x0951..=0x0957
931        | 0x0962..=0x0963
932        | 0x0981
933        | 0x09BC
934        | 0x09C1..=0x09C4
935        | 0x09CD
936        | 0x09E2..=0x09E3
937        | 0x09FE
938        | 0x0A01..=0x0A02
939        | 0x0A3C
940        | 0x0A41..=0x0A42
941        | 0x0A47..=0x0A48
942        | 0x0A4B..=0x0A4D
943        | 0x0A51
944        | 0x0A70..=0x0A71
945        | 0x0A75
946        | 0x0A81..=0x0A82
947        | 0x0ABC
948        | 0x0AC1..=0x0AC5
949        | 0x0AC7..=0x0AC8
950        | 0x0ACD
951        | 0x0AE2..=0x0AE3
952        | 0x0AFA..=0x0AFF
953        | 0x0B01
954        | 0x0B3C
955        | 0x0B3F
956        | 0x0B41..=0x0B44
957        | 0x0B4D
958        | 0x0B56
959        | 0x0B62..=0x0B63
960        | 0x0B82
961        | 0x0BC0
962        | 0x0BCD
963        | 0x0C00
964        | 0x0C04
965        | 0x0C3E..=0x0C40
966        | 0x0C46..=0x0C48
967        | 0x0C4A..=0x0C4D
968        | 0x0C55..=0x0C56
969        | 0x0C62..=0x0C63
970        | 0x0C81
971        | 0x0CBC
972        | 0x0CBF
973        | 0x0CC6
974        | 0x0CCC..=0x0CCD
975        | 0x0CE2..=0x0CE3
976        | 0x0D00..=0x0D01
977        | 0x0D3B..=0x0D3C
978        | 0x0D41..=0x0D44
979        | 0x0D4D
980        | 0x0D62..=0x0D63
981        | 0x0DCA
982        | 0x0DD2..=0x0DD4
983        | 0x0DD6
984        | 0x0E31
985        | 0x0E34..=0x0E3A
986        | 0x0E47..=0x0E4E
987        | 0x0EB1
988        | 0x0EB4..=0x0EBC
989        | 0x0EC8..=0x0ECD
990        | 0x0F18..=0x0F19
991        | 0x0F35
992        | 0x0F37
993        | 0x0F39
994        | 0x0F71..=0x0F7E
995        | 0x0F80..=0x0F84
996        | 0x0F86..=0x0F87
997        | 0x0F8D..=0x0F97
998        | 0x0F99..=0x0FBC
999        | 0x0FC6
1000        | 0x102D..=0x1030
1001        | 0x1032..=0x1037
1002        | 0x1039..=0x103A
1003        | 0x103D..=0x103E
1004        | 0x1058..=0x1059
1005        | 0x105E..=0x1060
1006        | 0x1071..=0x1074
1007        | 0x1082
1008        | 0x1085..=0x1086
1009        | 0x108D
1010        | 0x109D
1011        | 0x1160..=0x11FF // Hangul Jamo medial vowels and final consonants
1012        | 0x135D..=0x135F
1013        | 0x1712..=0x1714
1014        | 0x1732..=0x1734
1015        | 0x1752..=0x1753
1016        | 0x1772..=0x1773
1017        | 0x17B4..=0x17B5
1018        | 0x17B7..=0x17BD
1019        | 0x17C6
1020        | 0x17C9..=0x17D3
1021        | 0x17DD
1022        | 0x180B..=0x180D
1023        | 0x1885..=0x1886
1024        | 0x18A9
1025        | 0x1920..=0x1922
1026        | 0x1927..=0x1928
1027        | 0x1932
1028        | 0x1939..=0x193B
1029        | 0x1A17..=0x1A18
1030        | 0x1A1B
1031        | 0x1A56
1032        | 0x1A58..=0x1A5E
1033        | 0x1A60
1034        | 0x1A62
1035        | 0x1A65..=0x1A6C
1036        | 0x1A73..=0x1A7C
1037        | 0x1A7F
1038        | 0x1AB0..=0x1ABE
1039        | 0x1B00..=0x1B03
1040        | 0x1B34
1041        | 0x1B36..=0x1B3A
1042        | 0x1B3C
1043        | 0x1B42
1044        | 0x1B6B..=0x1B73
1045        | 0x1B80..=0x1B81
1046        | 0x1BA2..=0x1BA5
1047        | 0x1BA8..=0x1BA9
1048        | 0x1BAB..=0x1BAD
1049        | 0x1BE6
1050        | 0x1BE8..=0x1BE9
1051        | 0x1BED
1052        | 0x1BEF..=0x1BF1
1053        | 0x1C2C..=0x1C33
1054        | 0x1C36..=0x1C37
1055        | 0x1CD0..=0x1CD2
1056        | 0x1CD4..=0x1CE0
1057        | 0x1CE2..=0x1CE8
1058        | 0x1CED
1059        | 0x1CF4
1060        | 0x1CF8..=0x1CF9
1061        | 0x1DC0..=0x1DF9
1062        | 0x1DFB..=0x1DFF
1063        | 0x200B..=0x200F // Zero-width space, ZWNJ, ZWJ, LRM, RLM
1064        | 0x202A..=0x202E // Bidi control chars
1065        | 0x2060..=0x2064 // Word joiner, invisible operators
1066        | 0x2066..=0x206F // Bidi isolates
1067        | 0x20D0..=0x20F0 // Combining marks for symbols
1068        | 0xFE00..=0xFE0F // Variation Selectors
1069        | 0xFE20..=0xFE2F // Combining Half Marks
1070        | 0xFEFF          // Zero Width No-Break Space (BOM)
1071        | 0xFFF9..=0xFFFB // Interlinear annotation anchors
1072        | 0x1D167..=0x1D169
1073        | 0x1D173..=0x1D182
1074        | 0x1D185..=0x1D18B
1075        | 0x1D1AA..=0x1D1AD
1076        | 0x1D242..=0x1D244
1077        | 0xE0001
1078        | 0xE0020..=0xE007F
1079        | 0xE0100..=0xE01EF // Variation Selectors Supplement
1080    )
1081}
1082
1083/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
1084/// Matches glibc wcwidth() behavior for maximum GNU compatibility.
1085#[inline]
1086fn is_wide_char(cp: u32) -> bool {
1087    matches!(
1088        cp,
1089        0x1100..=0x115F   // Hangul Jamo
1090        | 0x231A..=0x231B // Watch, Hourglass
1091        | 0x2329..=0x232A // Angle Brackets
1092        | 0x23E9..=0x23F3 // Various symbols
1093        | 0x23F8..=0x23FA
1094        | 0x25FD..=0x25FE
1095        | 0x2614..=0x2615
1096        | 0x2648..=0x2653
1097        | 0x267F
1098        | 0x2693
1099        | 0x26A1
1100        | 0x26AA..=0x26AB
1101        | 0x26BD..=0x26BE
1102        | 0x26C4..=0x26C5
1103        | 0x26CE
1104        | 0x26D4
1105        | 0x26EA
1106        | 0x26F2..=0x26F3
1107        | 0x26F5
1108        | 0x26FA
1109        | 0x26FD
1110        | 0x2702
1111        | 0x2705
1112        | 0x2708..=0x270D
1113        | 0x270F
1114        | 0x2712
1115        | 0x2714
1116        | 0x2716
1117        | 0x271D
1118        | 0x2721
1119        | 0x2728
1120        | 0x2733..=0x2734
1121        | 0x2744
1122        | 0x2747
1123        | 0x274C
1124        | 0x274E
1125        | 0x2753..=0x2755
1126        | 0x2757
1127        | 0x2763..=0x2764
1128        | 0x2795..=0x2797
1129        | 0x27A1
1130        | 0x27B0
1131        | 0x27BF
1132        | 0x2934..=0x2935
1133        | 0x2B05..=0x2B07
1134        | 0x2B1B..=0x2B1C
1135        | 0x2B50
1136        | 0x2B55
1137        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
1138        | 0x3040..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
1139        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
1140        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
1141        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
1142        | 0xAC00..=0xD7A3  // Hangul Syllables
1143        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
1144        | 0xFE10..=0xFE19  // Vertical Forms
1145        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
1146        | 0xFF01..=0xFF60  // Fullwidth Forms (fullwidth ASCII and punctuation)
1147        | 0xFFE0..=0xFFE6  // Fullwidth Signs
1148        | 0x1F004
1149        | 0x1F0CF
1150        | 0x1F170..=0x1F171
1151        | 0x1F17E..=0x1F17F
1152        | 0x1F18E
1153        | 0x1F191..=0x1F19A
1154        | 0x1F1E0..=0x1F1FF // Regional Indicators
1155        | 0x1F200..=0x1F202
1156        | 0x1F210..=0x1F23B
1157        | 0x1F240..=0x1F248
1158        | 0x1F250..=0x1F251
1159        | 0x1F260..=0x1F265
1160        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
1161        | 0x1F680..=0x1F6FF // Transport Symbols
1162        | 0x1F900..=0x1F9FF // Supplemental Symbols
1163        | 0x1FA00..=0x1FA6F
1164        | 0x1FA70..=0x1FAFF
1165        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
1166        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
1167    )
1168}
1169
1170/// Compute maximum display width of any line (C/POSIX locale).
1171///
1172/// GNU wc -L behavior in C locale:
1173/// - `\n`: line terminator (records max, resets position)
1174/// - `\t`: advances to next tab stop (multiple of 8)
1175/// - `\r`: carriage return (resets position to 0, same line)
1176/// - `\f`: form feed (acts as line terminator like \n)
1177/// - Printable ASCII (0x20..0x7E): width 1
1178/// - Everything else (controls, high bytes): width 0
1179///
1180/// Optimized with printable ASCII run counting: for runs of bytes in
1181/// 0x21-0x7E (no space/tab/newline), counts the entire run length at once.
1182pub fn max_line_length_c(data: &[u8]) -> u64 {
1183    let mut max_len: u64 = 0;
1184    let mut line_len: u64 = 0;
1185    let mut linepos: u64 = 0;
1186    let mut i = 0;
1187    let len = data.len();
1188
1189    while i < len {
1190        let b = unsafe { *data.get_unchecked(i) };
1191        if b >= 0x21 && b <= 0x7E {
1192            // Printable non-space ASCII — count run length
1193            i += 1;
1194            let mut run = 1u64;
1195            while i < len {
1196                let b = unsafe { *data.get_unchecked(i) };
1197                if b >= 0x21 && b <= 0x7E {
1198                    run += 1;
1199                    i += 1;
1200                } else {
1201                    break;
1202                }
1203            }
1204            linepos += run;
1205            if linepos > line_len {
1206                line_len = linepos;
1207            }
1208        } else {
1209            match b {
1210                b' ' => {
1211                    linepos += 1;
1212                    if linepos > line_len {
1213                        line_len = linepos;
1214                    }
1215                }
1216                b'\n' => {
1217                    if line_len > max_len {
1218                        max_len = line_len;
1219                    }
1220                    linepos = 0;
1221                    line_len = 0;
1222                }
1223                b'\t' => {
1224                    linepos = (linepos + 8) & !7;
1225                    if linepos > line_len {
1226                        line_len = linepos;
1227                    }
1228                }
1229                b'\r' => {
1230                    linepos = 0;
1231                }
1232                0x0C => {
1233                    if line_len > max_len {
1234                        max_len = line_len;
1235                    }
1236                    linepos = 0;
1237                    line_len = 0;
1238                }
1239                _ => {} // Non-printable: width 0
1240            }
1241            i += 1;
1242        }
1243    }
1244
1245    if line_len > max_len {
1246        max_len = line_len;
1247    }
1248
1249    max_len
1250}
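
// Width sketch for the C-locale rules above (expected values assumed): a tab
// jumps to the next multiple of 8, and a carriage return rewinds the position
// without ending the line, so the longest prefix still wins.
#[test]
fn max_line_length_c_examples() {
    assert_eq!(max_line_length_c(b"ab\tc\nxy"), 9); // "ab", tab to column 8, then "c"
    assert_eq!(max_line_length_c(b"abcdef\rxy"), 6); // '\r' rewinds; "abcdef" is longest
}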
1251
1252/// Compute maximum display width of any line (UTF-8 locale).
1253///
1254/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
1255/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
1256///
1257/// Optimized with printable ASCII run counting for common text.
1258pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1259    let mut max_len: u64 = 0;
1260    let mut line_len: u64 = 0;
1261    let mut linepos: u64 = 0;
1262    let mut i = 0;
1263    let len = data.len();
1264
1265    while i < len {
1266        let b = unsafe { *data.get_unchecked(i) };
1267
1268        if b >= 0x21 && b <= 0x7E {
1269            // Printable non-space ASCII (most common) — count run length
1270            i += 1;
1271            let mut run = 1u64;
1272            while i < len {
1273                let b = unsafe { *data.get_unchecked(i) };
1274                if b >= 0x21 && b <= 0x7E {
1275                    run += 1;
1276                    i += 1;
1277                } else {
1278                    break;
1279                }
1280            }
1281            linepos += run;
1282            if linepos > line_len {
1283                line_len = linepos;
1284            }
1285        } else if b < 0x80 {
1286            // Other ASCII: space, tab, newline, controls
1287            match b {
1288                b' ' => {
1289                    linepos += 1;
1290                    if linepos > line_len {
1291                        line_len = linepos;
1292                    }
1293                }
1294                b'\n' => {
1295                    if line_len > max_len {
1296                        max_len = line_len;
1297                    }
1298                    linepos = 0;
1299                    line_len = 0;
1300                }
1301                b'\t' => {
1302                    linepos = (linepos + 8) & !7;
1303                    if linepos > line_len {
1304                        line_len = linepos;
1305                    }
1306                }
1307                b'\r' => {
1308                    linepos = 0;
1309                }
1310                0x0C => {
1311                    if line_len > max_len {
1312                        max_len = line_len;
1313                    }
1314                    linepos = 0;
1315                    line_len = 0;
1316                }
1317                _ => {} // Non-printable: width 0
1318            }
1319            i += 1;
1320        } else {
1321            // Multibyte UTF-8
1322            let (cp, char_len) = decode_utf8(&data[i..]);
1323
1324            // C1 control characters (0x80..0x9F): non-printable, width 0
1325            if cp <= 0x9F {
1326                // width 0
1327            } else if is_zero_width(cp) {
1328                // Combining marks, zero-width chars: width 0
1329            } else if is_wide_char(cp) {
1330                linepos += 2;
1331                if linepos > line_len {
1332                    line_len = linepos;
1333                }
1334            } else {
1335                // Regular printable Unicode character: width 1
1336                linepos += 1;
1337                if linepos > line_len {
1338                    line_len = linepos;
1339                }
1340            }
1341            i += char_len;
1342        }
1343    }
1344
1345    // Handle last line
1346    if line_len > max_len {
1347        max_len = line_len;
1348    }
1349
1350    max_len
1351}
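
// Display-width sketch: CJK ideographs count as two columns and a combining
// accent adds nothing, matching the wcwidth()-style rules described above.
#[test]
fn max_line_length_utf8_examples() {
    assert_eq!(max_line_length_utf8("日本\nab".as_bytes()), 4); // two wide chars
    assert_eq!(max_line_length_utf8("e\u{301}!".as_bytes()), 2); // combining mark is width 0
}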
1352
1353/// Compute maximum display width, choosing behavior based on locale.
1354#[inline]
1355pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1356    if utf8 {
1357        max_line_length_utf8(data)
1358    } else {
1359        max_line_length_c(data)
1360    }
1361}
1362
1363/// Count all metrics using optimized individual passes.
1364///
1365/// Each metric uses its own optimized algorithm:
1366/// - Lines: SIMD-accelerated memchr (C locale); fused with word counting in UTF-8 mode
1367/// - Words: 2-state byte scan (C locale) or 3-state UTF-8 state machine
1368/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1369/// - Max line length: locale-aware display width tracking
1370///
1371/// Multi-pass is faster than single-pass because each pass has a tight,
1372/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1373/// making subsequent passes nearly free for memory bandwidth.
1374pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1375    if utf8 {
1376        let (lines, words) = count_lines_words_utf8_fused(data);
1377        WcCounts {
1378            lines,
1379            words,
1380            bytes: data.len() as u64,
1381            chars: count_chars_utf8(data),
1382            max_line_length: max_line_length_utf8(data),
1383        }
1384    } else {
1385        WcCounts {
1386            lines: count_lines(data),
1387            words: count_words_locale(data, false),
1388            bytes: data.len() as u64,
1389            chars: data.len() as u64,
1390            max_line_length: max_line_length_c(data),
1391        }
1392    }
1393}
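
// End-to-end sketch of the aggregate entry point in C-locale mode; the expected
// tuple is an assumption worked out by hand for "hello world\n".
#[test]
fn count_all_c_locale_example() {
    let c = count_all(b"hello world\n", false);
    assert_eq!(
        (c.lines, c.words, c.bytes, c.chars, c.max_line_length),
        (1, 2, 12, 12, 11)
    );
}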
1394
1395/// Quick check if data is likely all-ASCII by sampling three regions.
1396/// Checks first 256 bytes, middle 256 bytes, and last 256 bytes.
1397/// If any byte >= 0x80 is found, returns false.
1398#[inline]
1399fn check_ascii_sample(data: &[u8]) -> bool {
1400    let len = data.len();
1401    if len == 0 {
1402        return true;
1403    }
1404
1405    // Check in 8-byte blocks using OR-accumulation for speed
1406    let check_region = |start: usize, end: usize| -> bool {
1407        let mut or_acc = 0u8;
1408        let region = &data[start..end];
1409        let mut i = 0;
1410        while i + 8 <= region.len() {
1411            unsafe {
1412                or_acc |= *region.get_unchecked(i);
1413                or_acc |= *region.get_unchecked(i + 1);
1414                or_acc |= *region.get_unchecked(i + 2);
1415                or_acc |= *region.get_unchecked(i + 3);
1416                or_acc |= *region.get_unchecked(i + 4);
1417                or_acc |= *region.get_unchecked(i + 5);
1418                or_acc |= *region.get_unchecked(i + 6);
1419                or_acc |= *region.get_unchecked(i + 7);
1420            }
1421            i += 8;
1422        }
1423        while i < region.len() {
1424            or_acc |= region[i];
1425            i += 1;
1426        }
1427        or_acc < 0x80
1428    };
1429
1430    let sample = 256.min(len);
1431
1432    // Check beginning
1433    if !check_region(0, sample) {
1434        return false;
1435    }
1436    // Check middle
1437    if len > sample * 2 {
1438        let mid = len / 2;
1439        let mid_start = mid.saturating_sub(sample / 2);
1440        if !check_region(mid_start, (mid_start + sample).min(len)) {
1441            return false;
1442        }
1443    }
1444    // Check end
1445    if len > sample {
1446        if !check_region(len - sample, len) {
1447            return false;
1448        }
1449    }
1450
1451    true
1452}
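
// Heuristic sketch: only three 256-byte windows are sampled, so a stray non-ASCII
// byte elsewhere goes unnoticed. That is why callers treat this as "likely ASCII"
// rather than a guarantee.
#[test]
fn check_ascii_sample_is_a_heuristic() {
    assert!(check_ascii_sample(b"plain ascii"));
    assert!(!check_ascii_sample("héllo".as_bytes()));
    let mut data = vec![b'a'; 4096];
    data[300] = 0xC3; // outside all three sampled windows
    assert!(check_ascii_sample(&data)); // the heuristic misses it by design
}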
1453
1454// ──────────────────────────────────────────────────
1455// Parallel counting for large files
1456// ──────────────────────────────────────────────────
1457
1458/// Split data into chunks at newline boundaries for parallel processing.
1459/// Returns slices where each slice (except possibly the last) ends with `\n`.
1460/// Splitting at newlines guarantees word boundaries in any locale,
1461/// enabling safe parallel word counting without boundary adjustment.
1462fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1463    if data.is_empty() || num_chunks <= 1 {
1464        return vec![data];
1465    }
1466    let chunk_size = data.len() / num_chunks;
1467    let mut chunks = Vec::with_capacity(num_chunks);
1468    let mut pos = 0;
1469
1470    for _ in 0..num_chunks - 1 {
1471        let target = pos + chunk_size;
1472        if target >= data.len() {
1473            break;
1474        }
1475        let boundary = memchr::memchr(b'\n', &data[target..])
1476            .map(|p| target + p + 1)
1477            .unwrap_or(data.len());
1478        if boundary > pos {
1479            chunks.push(&data[pos..boundary]);
1480        }
1481        pos = boundary;
1482    }
1483    if pos < data.len() {
1484        chunks.push(&data[pos..]);
1485    }
1486    chunks
1487}
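
// Splitting sketch: every chunk except possibly the last ends right after a '\n',
// so no word can straddle a chunk boundary. The number of chunks returned may be
// smaller than requested.
#[test]
fn split_at_newlines_example() {
    let chunks = split_at_newlines(b"aa\nbb\ncc\n", 3);
    assert_eq!(chunks, vec![&b"aa\nbb\n"[..], &b"cc\n"[..]]);
    assert!(chunks.iter().all(|c| c.last() == Some(&b'\n')));
}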
1488
1489/// Count newlines in parallel using SIMD memchr + rayon.
1490/// Each chunk is at least 2MB, to amortize rayon scheduling and SIMD setup overhead.
1491pub fn count_lines_parallel(data: &[u8]) -> u64 {
1492    if data.len() < PARALLEL_THRESHOLD {
1493        return count_lines(data);
1494    }
1495
1496    let num_threads = rayon::current_num_threads().max(1);
1497    // Ensure chunks are large enough to amortize SIMD setup overhead
1498    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1499
1500    data.par_chunks(chunk_size)
1501        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1502        .sum()
1503}
1504
1505/// Count words in parallel with boundary adjustment.
1506pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1507    if data.len() < PARALLEL_THRESHOLD {
1508        return count_words_locale(data, utf8);
1509    }
1510
1511    let num_threads = rayon::current_num_threads().max(1);
1512
1513    if utf8 {
1514        // UTF-8: split at newline boundaries for safe parallel word counting.
1515        // Newlines are always word boundaries, so no boundary adjustment needed.
1516        let chunks = split_at_newlines(data, num_threads);
1517        chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1518    } else {
1519        // C locale: parallel 2-state word counting with boundary adjustment
1520        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1521
1522        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1523
1524        // Each chunk returns (line_count, word_count, first_is_word, ends_in_word)
1525        let results: Vec<(u64, u64, bool, bool)> = chunks
1526            .par_iter()
1527            .map(|chunk| count_lw_c_chunk(chunk))
1528            .collect();
1529
1530        let mut total = 0u64;
1531        for i in 0..results.len() {
1532            total += results[i].1;
1533            // Boundary adjustment: if the previous chunk ended inside a word AND
1534            // the current chunk's first byte is word content (non-space),
1535            // the word was split across chunks, so subtract the overcount.
1536            if i > 0 && results[i - 1].3 && results[i].2 {
1537                total -= 1;
1538            }
1539        }
1540        total
1541    }
1542}
1543
1544/// Count characters in parallel: UTF-8 character count when utf8 is true, byte count otherwise.
1545pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1546    if !utf8 {
1547        return data.len() as u64;
1548    }
1549    if data.len() < PARALLEL_THRESHOLD {
1550        return count_chars_utf8(data);
1551    }
1552
1553    let num_threads = rayon::current_num_threads().max(1);
1554    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1555
1556    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1557}
1558
1559/// Count lines + words + bytes in a single fused pass (the default wc mode).
1560/// Avoids separate passes entirely — combines newline counting with word detection.
1561pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1562    let (lines, words) = count_lines_words(data, utf8);
1563    (lines, words, data.len() as u64)
1564}
1565
1566/// Parallel counting of lines + words + bytes only (no chars).
1567/// Optimized for the default `wc` mode: avoids the unnecessary char-counting pass.
1568/// C locale: single fused pass per chunk counts BOTH lines and words.
1569/// UTF-8: checks ASCII first for C locale fast path, else splits at newlines
1570/// for safe parallel UTF-8 word counting.
1571pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1572    if data.len() < PARALLEL_THRESHOLD {
1573        // Small file: use fused single-pass
1574        return count_lwb(data, utf8);
1575    }
1576
1577    let num_threads = rayon::current_num_threads().max(1);
1578
1579    let (lines, words) = if !utf8 {
1580        // C locale: FUSED parallel lines+words counting — single pass per chunk
1581        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1582
1583        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1584        let results: Vec<(u64, u64, bool, bool)> = chunks
1585            .par_iter()
1586            .map(|chunk| count_lw_c_chunk_fast(chunk))
1587            .collect();
1588
1589        let mut line_total = 0u64;
1590        let mut word_total = 0u64;
1591        for i in 0..results.len() {
1592            line_total += results[i].0;
1593            word_total += results[i].1;
1594            if i > 0 && results[i - 1].3 && results[i].2 {
1595                word_total -= 1;
1596            }
1597        }
1598
1599        (line_total, word_total)
1600    } else {
1601        // UTF-8 locale: check if ASCII for faster C locale path
1602        let is_ascii = check_ascii_sample(data);
1603        if is_ascii {
1604            // Pure ASCII: use C locale parallel path (arbitrary chunks OK)
1605            let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1606            let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1607            let results: Vec<(u64, u64, bool, bool)> = chunks
1608                .par_iter()
1609                .map(|chunk| count_lw_c_chunk_fast(chunk))
1610                .collect();
1611
1612            let mut line_total = 0u64;
1613            let mut word_total = 0u64;
1614            for i in 0..results.len() {
1615                line_total += results[i].0;
1616                word_total += results[i].1;
1617                if i > 0 && results[i - 1].3 && results[i].2 {
1618                    word_total -= 1;
1619                }
1620            }
1621            (line_total, word_total)
1622        } else {
1623            // Non-ASCII UTF-8: split at newline boundaries for safe parallel
1624            // word counting. Newlines always break words, so no adjustment needed.
1625            let chunks = split_at_newlines(data, num_threads);
1626            let results: Vec<(u64, u64)> = chunks
1627                .par_iter()
1628                .map(|chunk| count_lines_words_utf8_fused(chunk))
1629                .collect();
1630            let mut line_total = 0u64;
1631            let mut word_total = 0u64;
1632            for (l, w) in results {
1633                line_total += l;
1634                word_total += w;
1635            }
1636            (line_total, word_total)
1637        }
1638    };
1639
1640    (lines, words, data.len() as u64)
1641}
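
// Minimal usage sketch for the default `wc` output (lines, words, bytes). The
// `wc_default` wrapper and the std::fs::read I/O path are illustrative only; the
// real binary's I/O layer (mmap, stdin handling, locale detection) lives outside
// this module.
#[cfg(test)]
mod count_lwb_parallel_usage_sketch {
    use super::*;

    // Hypothetical convenience wrapper, not part of the public API.
    fn wc_default(path: &std::path::Path, utf8_locale: bool) -> std::io::Result<(u64, u64, u64)> {
        let data = std::fs::read(path)?;
        Ok(count_lwb_parallel(&data, utf8_locale))
    }

    #[test]
    fn default_mode_on_a_small_file() -> std::io::Result<()> {
        let path = std::env::temp_dir().join("wc_core_lwb_usage_sketch.txt");
        std::fs::write(&path, b"hello world\nsecond line\n")?;
        // 2 newlines, 4 words, 24 bytes.
        assert_eq!(wc_default(&path, false)?, (2, 4, 24));
        std::fs::remove_file(&path)?;
        Ok(())
    }
}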
1642
1643/// Combined parallel counting of lines + words + chars.
1644/// UTF-8: splits at newline boundaries for fused lines+words+chars per chunk.
1645/// C locale: fused parallel lines+words with boundary adjustment + parallel chars.
1646pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1647    if data.len() < PARALLEL_THRESHOLD {
1648        let lines = count_lines(data);
1649        let words = count_words_locale(data, utf8);
1650        let chars = count_chars(data, utf8);
1651        return (lines, words, chars);
1652    }
1653
1654    let num_threads = rayon::current_num_threads().max(1);
1655
1656    if utf8 {
1657        // UTF-8: fused parallel lines+words+chars per chunk (split at newlines)
1658        let chunks = split_at_newlines(data, num_threads);
1659        let results: Vec<(u64, u64, u64)> = chunks
1660            .par_iter()
1661            .map(|chunk| {
1662                let (lines, words) = count_lines_words_utf8_fused(chunk);
1663                let chars = count_chars_utf8(chunk);
1664                (lines, words, chars)
1665            })
1666            .collect();
1667        let mut lines = 0u64;
1668        let mut words = 0u64;
1669        let mut chars = 0u64;
1670        for (l, w, c) in results {
1671            lines += l;
1672            words += w;
1673            chars += c;
1674        }
1675        (lines, words, chars)
1676    } else {
1677        // C locale: fused parallel lines+words + parallel chars (= byte count)
1678        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1679        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1680        let results: Vec<(u64, u64, bool, bool)> = chunks
1681            .par_iter()
1682            .map(|chunk| count_lw_c_chunk_fast(chunk))
1683            .collect();
1684        let mut lines = 0u64;
1685        let mut words = 0u64;
1686        for i in 0..results.len() {
1687            lines += results[i].0;
1688            words += results[i].1;
1689            if i > 0 && results[i - 1].3 && results[i].2 {
1690                words -= 1;
1691            }
1692        }
1693        (lines, words, data.len() as u64)
1694    }
1695}
1696
1697/// Parallel max line length computation.
1698/// Splits at newline boundaries so each chunk independently computes correct
1699/// max line width (since newlines reset position tracking).
1700pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1701    if data.len() < PARALLEL_THRESHOLD {
1702        return max_line_length(data, utf8);
1703    }
1704    let num_threads = rayon::current_num_threads().max(1);
1705    let chunks = split_at_newlines(data, num_threads);
1706    chunks
1707        .par_iter()
1708        .map(|chunk| {
1709            if utf8 {
1710                max_line_length_utf8(chunk)
1711            } else {
1712                max_line_length_c(chunk)
1713            }
1714        })
1715        .max()
1716        .unwrap_or(0)
1717}
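
// Consistency sketch: because chunks end at newlines, folding the per-chunk maxima
// with `max` is enough. Checked against the serial routine on a buffer whose
// longest line is far longer than the rest (4096 columns of 'x').
#[cfg(test)]
mod max_line_length_parallel_sketch {
    use super::*;

    #[test]
    fn parallel_matches_serial_with_one_very_long_line() {
        let mut data = b"short line\n".repeat(100_000); // ~1.1 MB of short lines
        data.extend(std::iter::repeat(b'x').take(4096));
        data.push(b'\n');
        data.extend_from_slice(b"tail\n");
        assert_eq!(
            max_line_length_parallel(&data, false),
            max_line_length(&data, false)
        );
        assert_eq!(max_line_length_parallel(&data, false), 4096);
    }
}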
1718
1719/// Parallel counting of all metrics at once.
1720/// Splits at newline boundaries for safe parallel word + max_line_length counting.
1721/// Each chunk computes all of its metrics while it is still cache-resident, maximizing cache reuse.
1722pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1723    if data.len() < PARALLEL_THRESHOLD {
1724        return count_all(data, utf8);
1725    }
1726
1727    let num_threads = rayon::current_num_threads().max(1);
1728    let chunks = split_at_newlines(data, num_threads);
1729
1730    if utf8 {
1731        let results: Vec<(u64, u64, u64, u64)> = chunks
1732            .par_iter()
1733            .map(|chunk| {
1734                let (lines, words) = count_lines_words_utf8_fused(chunk);
1735                let chars = count_chars_utf8(chunk);
1736                let max_ll = max_line_length_utf8(chunk);
1737                (lines, words, chars, max_ll)
1738            })
1739            .collect();
1740
1741        let mut counts = WcCounts {
1742            bytes: data.len() as u64,
1743            ..Default::default()
1744        };
1745        for (l, w, c, m) in results {
1746            counts.lines += l;
1747            counts.words += w;
1748            counts.chars += c;
1749            if m > counts.max_line_length {
1750                counts.max_line_length = m;
1751            }
1752        }
1753        counts
1754    } else {
1755        // C locale: fused lines+words per chunk + max_line_length per chunk
1756        let results: Vec<(u64, u64, u64)> = chunks
1757            .par_iter()
1758            .map(|chunk| {
1759                let (lines, words) = count_lines_words(chunk, false);
1760                let max_ll = max_line_length_c(chunk);
1761                (lines, words, max_ll)
1762            })
1763            .collect();
1764
1765        let mut counts = WcCounts {
1766            bytes: data.len() as u64,
1767            chars: data.len() as u64,
1768            ..Default::default()
1769        };
1770        for (l, w, m) in &results {
1771            counts.lines += l;
1772            counts.words += w;
1773            if *m > counts.max_line_length {
1774                counts.max_line_length = *m;
1775            }
1776        }
1777        counts
1778    }
1779}
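
// End-to-end consistency sketch: the parallel aggregator should agree with the
// serial `count_all` on data well above PARALLEL_THRESHOLD, in both the C locale
// and the UTF-8 locale.
#[cfg(test)]
mod count_all_parallel_consistency_sketch {
    use super::*;

    #[test]
    fn parallel_matches_serial_for_both_locales() {
        let data = "naïve text, ασκός\tand some ASCII words\n"
            .as_bytes()
            .repeat(50_000); // a few MB, well above PARALLEL_THRESHOLD
        for &utf8 in &[false, true] {
            assert_eq!(count_all_parallel(&data, utf8), count_all(&data, utf8));
        }
    }
}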