Skip to main content

coreutils_rs/wc/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Smallest input size, in bytes, at which parallel processing is used (1 MiB).
///
/// Scheduling a Rayon task costs roughly 5-10μs. With SIMD memchr running at
/// about 10 GB/s, a 1 MiB chunk takes around 100μs to scan, so the scheduling
/// overhead stays below 10%.
const PARALLEL_THRESHOLD: usize = 1 << 20;
8
/// Tallies produced by counting one byte slice.
///
/// `Default` yields a value with every counter at zero.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct WcCounts {
    /// Line count.
    pub lines: u64,
    /// Word count.
    pub words: u64,
    /// Byte count.
    pub bytes: u64,
    /// Character count.
    pub chars: u64,
    /// Length of the longest line.
    pub max_line_length: u64,
}
18
19// ──────────────────────────────────────────────────
20// 3-state byte classification for word counting
21// ──────────────────────────────────────────────────
22//
23// GNU wc uses mbrtowc() + iswspace() + iswprint() with 3-state logic:
24//   0 = printable (word content): starts or continues a word
25//   1 = space (word break): ends any current word
26//   2 = transparent (unchanged): non-printable, non-space — does NOT change in_word
27//
28// The critical difference from 2-state is that transparent characters
29// (NUL, control chars, invalid UTF-8) do NOT break words.
30// Example: "hello\x00world" is 1 word (NUL is transparent), not 2.
31
/// Per-byte word-counting classes for the C/POSIX locale.
///
/// GNU wc classifies every character with `iswspace()` / `iswprint()` and acts
/// on one of three classes:
///   0 = printable (word content): starts or continues a word
///   1 = space (word break): ends any current word
///   2 = transparent: neither space nor printable, leaves `in_word` untouched
///
/// In the C locale each byte is its own character, so:
///   - 0x09-0x0D and 0x20 are spaces (0x20 is also printable, but space wins),
///   - 0x21-0x7E are printable,
///   - 0x00-0x08, 0x0E-0x1F, 0x7F and 0x80-0xFF are transparent.
///
/// That is exactly the single-byte table used for UTF-8, so it is reused here.
const BYTE_CLASS_C: [u8; 256] = BYTE_CLASS_UTF8;

/// Builds the 3-state single-byte classification table for the UTF-8 locale.
///
/// Only single bytes are classified here; multi-byte UTF-8 sequences are
/// decoded and classified by the word-counting state machines.
const fn make_byte_class_utf8() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut byte = 0usize;
    // `for` is not available in `const fn`, hence the manual counter.
    while byte < 256 {
        table[byte] = match byte {
            // \t, \n, \v, \f, \r and the space character: word breaks.
            0x09..=0x0D | 0x20 => 1,
            // Printable ASCII: word content.
            0x21..=0x7E => 0,
            // Control bytes, DEL and every byte >= 0x80: transparent.
            _ => 2,
        };
        byte += 1;
    }
    table
}

/// 3-state single-byte classification table for the UTF-8 locale.
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
68
69// ──────────────────────────────────────────────────
70// Unicode character classification helpers
71// ──────────────────────────────────────────────────
72
/// Check whether a multi-byte Unicode code point separates words.
///
/// The set is glibc's `iswspace` class for non-ASCII code points plus the
/// no-break spaces that GNU wc additionally treats as separators
/// (U+00A0, U+2007, U+202F and U+2060).
/// ASCII spaces are not listed here; they are handled by the byte table.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    matches!(
        cp,
        0x00A0                  // No-Break Space
            | 0x1680            // Ogham Space Mark
            | 0x2000..=0x200A   // En Quad through Hair Space (includes U+2007 Figure Space)
            | 0x2028            // Line Separator
            | 0x2029            // Paragraph Separator
            | 0x202F            // Narrow No-Break Space
            | 0x205F            // Medium Mathematical Space
            | 0x2060            // Word Joiner
            | 0x3000 // Ideographic Space
    )
}
90
/// Check if a Unicode code point (>= 0x80) is printable (approximating glibc iswprint).
///
/// C1 control characters (U+0080-U+009F) are not printable, and most
/// characters from U+00A0 upwards are. Values that are not Unicode scalar
/// values — the surrogate range U+D800-U+DFFF and anything above U+10FFFF —
/// are never printable; the byte-level decoders in this file can produce such
/// values from malformed input, and they must not start a word.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    matches!(cp, 0xA0..=0xD7FF | 0xE000..=0x10_FFFF)
}
98
99// ──────────────────────────────────────────────────
100// Core counting functions
101// ──────────────────────────────────────────────────
102
103/// Count newlines using SIMD-accelerated memchr.
104/// GNU wc counts newline bytes (`\n`), not logical lines.
105#[inline]
106pub fn count_lines(data: &[u8]) -> u64 {
107    memchr_iter(b'\n', data).count() as u64
108}
109
/// Length of `data` in bytes; exists so every counter has a matching function.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let byte_len: usize = data.len();
    byte_len as u64
}
115
116/// Count words using locale-aware 3-state logic (default: UTF-8).
117pub fn count_words(data: &[u8]) -> u64 {
118    count_words_locale(data, true)
119}
120
121/// Count words with explicit locale control using 3-state logic.
122///
123/// GNU wc classifies each character as:
124///   - space (iswspace=true): sets in_word=false
125///   - printable (iswprint=true): sets in_word=true, increments word count on transition
126///   - transparent (neither): leaves in_word unchanged
127pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
128    if utf8 {
129        count_words_utf8(data)
130    } else {
131        count_words_c(data)
132    }
133}
134
135/// Count words in C/POSIX locale using 3-state logic matching GNU wc.
136/// GNU wc in C locale uses mbrtowc() + iswspace() + iswprint():
137///   - class 0 (printable: 0x20-0x7E minus spaces, i.e. 0x21-0x7E): word content
138///   - class 1 (space: 0x09-0x0D, 0x20): word break
139///   - class 2 (transparent: everything else): leaves in_word unchanged
140///
141/// This means bytes >= 0x80 and control characters do NOT start or break words.
142///
143/// Optimized with printable ASCII run skipping.
144fn count_words_c(data: &[u8]) -> u64 {
145    let mut words = 0u64;
146    let mut in_word = false;
147    let mut i = 0;
148    let len = data.len();
149
150    while i < len {
151        let b = unsafe { *data.get_unchecked(i) };
152        if b >= 0x21 && b <= 0x7E {
153            // Printable ASCII — word content (most common for text)
154            if !in_word {
155                in_word = true;
156                words += 1;
157            }
158            i += 1;
159            // Skip remaining printable ASCII
160            while i < len {
161                let b = unsafe { *data.get_unchecked(i) };
162                if b >= 0x21 && b <= 0x7E {
163                    i += 1;
164                } else {
165                    break;
166                }
167            }
168        } else {
169            let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
170            if class == 1 {
171                // Space — break word
172                in_word = false;
173            }
174            // class == 2 (transparent): leave in_word unchanged
175            i += 1;
176        }
177    }
178    words
179}
180
/// AVX2-accelerated fused line+word counter for C locale chunks.
///
/// Consumes 32 bytes per vector step and applies 3-state logic:
///   - printable ASCII (0x21-0x7E): word content (starts/continues words)
///   - space bytes (0x09-0x0D, 0x20): word breaks
///   - everything else: transparent (does NOT affect word state)
///
/// Word starts are found from bitmasks of the printable and space positions.
///
/// Returns `(lines, words, first_is_word, ends_in_word)`. `first_is_word` is
/// true when the first non-transparent byte of `data` is printable — the same
/// definition `count_lw_c_chunk` uses, so every path behind
/// `count_lw_c_chunk_fast` reports this boundary flag the same way.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    const LANES: usize = 32;
    // Each u8 lane of the newline accumulator grows by at most 1 per vector,
    // so it has to be drained after at most 255 vectors.
    const MAX_BATCH: usize = 255;

    let (vector_part, tail) = data.split_at(data.len() - data.len() % LANES);
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    // SAFETY: AVX2 availability is this function's caller contract. Every
    // load reads exactly `LANES` bytes from a `chunks_exact(LANES)` slice, and
    // `_mm256_loadu_si256` has no alignment requirement.
    unsafe {
        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
        let zero = _mm256_setzero_si256();
        let ones = _mm256_set1_epi8(1);
        // Printable ASCII is the open interval (0x20, 0x7F). The compares are
        // signed, so bytes >= 0x80 are negative and fail `> 0x20`.
        let lo_print = _mm256_set1_epi8(0x20i8);
        let hi_print = _mm256_set1_epi8(0x7Fi8);
        // Spaces: 0x20 itself, or the open interval (0x08, 0x0E).
        let space_char = _mm256_set1_epi8(0x20i8);
        let tab_lo = _mm256_set1_epi8(0x08i8);
        let tab_hi = _mm256_set1_epi8(0x0Ei8);

        for batch in vector_part.chunks(LANES * MAX_BATCH) {
            let mut line_acc = zero;

            for block in batch.chunks_exact(LANES) {
                let v = _mm256_loadu_si256(block.as_ptr() as *const __m256i);

                let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
                line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));

                let gt_20 = _mm256_cmpgt_epi8(v, lo_print);
                let lt_7f = _mm256_cmpgt_epi8(hi_print, v);
                let is_printable = _mm256_and_si256(gt_20, lt_7f);

                let is_sp = _mm256_cmpeq_epi8(v, space_char);
                let gt_08 = _mm256_cmpgt_epi8(v, tab_lo);
                let lt_0e = _mm256_cmpgt_epi8(tab_hi, v);
                let is_space = _mm256_or_si256(is_sp, _mm256_and_si256(gt_08, lt_0e));

                let print_mask = _mm256_movemask_epi8(is_printable) as u32;
                let space_mask = _mm256_movemask_epi8(is_space) as u32;

                if (print_mask | space_mask) == 0xFFFF_FFFF {
                    // No transparent bytes: a word starts at every printable
                    // position whose predecessor is not printable.
                    let prev_mask = (print_mask << 1) | (prev_in_word as u32);
                    total_words += (print_mask & !prev_mask).count_ones() as u64;
                    prev_in_word = (print_mask >> 31) == 1;
                } else {
                    // Transparent bytes present: walk only the positions that
                    // are printable or space, in ascending order. Positions in
                    // neither mask leave the state untouched, so skipping them
                    // is equivalent to visiting them.
                    let mut pending = print_mask | space_mask;
                    while pending != 0 {
                        let lowest = pending & pending.wrapping_neg();
                        if space_mask & lowest != 0 {
                            prev_in_word = false;
                        } else if !prev_in_word {
                            total_words += 1;
                            prev_in_word = true;
                        }
                        pending &= pending - 1;
                    }
                }
            }

            // Horizontal sum of the 32 u8 lanes via PSADBW against zero.
            let sad = _mm256_sad_epu8(line_acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let s = _mm_add_epi64(lo, hi);
            let h64 = _mm_unpackhi_epi64(s, s);
            let t = _mm_add_epi64(s, h64);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }
    }

    // Scalar tail (fewer than 32 bytes) with the same 3-state rules.
    for &b in tail {
        match b {
            b'\n' => {
                total_lines += 1;
                prev_in_word = false;
            }
            0x21..=0x7E => {
                if !prev_in_word {
                    total_words += 1;
                    prev_in_word = true;
                }
            }
            0x09..=0x0D | 0x20 => prev_in_word = false,
            _ => {} // transparent
        }
    }

    // Class of the first byte that is not transparent. Leading transparent
    // bytes must be skipped: a word from the previous chunk continues through
    // them into this chunk's first printable byte.
    let first_is_word = data
        .iter()
        .find_map(|&b| match b {
            0x21..=0x7E => Some(true),
            0x09..=0x0D | 0x20 => Some(false),
            _ => None,
        })
        .unwrap_or(false);

    (total_lines, total_words, first_is_word, prev_in_word)
}
317
/// SSE2-accelerated fused line+word counter for C locale chunks.
///
/// Same 3-state algorithm as the AVX2 variant, 16 bytes per vector step:
///   - printable ASCII (0x21-0x7E): word content (starts/continues words)
///   - space bytes (0x09-0x0D, 0x20): word breaks
///   - everything else: transparent (does NOT affect word state)
///
/// Returns `(lines, words, first_is_word, ends_in_word)`. `first_is_word` is
/// true when the first non-transparent byte of `data` is printable — the same
/// definition `count_lw_c_chunk` uses, so every path behind
/// `count_lw_c_chunk_fast` reports this boundary flag the same way.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSE2, which is part of the
/// x86_64 baseline.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    const LANES: usize = 16;
    // Each u8 lane of the newline accumulator grows by at most 1 per vector,
    // so it has to be drained after at most 255 vectors.
    const MAX_BATCH: usize = 255;

    let (vector_part, tail) = data.split_at(data.len() - data.len() % LANES);
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    // SAFETY: SSE2 availability is this function's caller contract. Every
    // load reads exactly `LANES` bytes from a `chunks_exact(LANES)` slice, and
    // `_mm_loadu_si128` has no alignment requirement.
    unsafe {
        let nl_byte = _mm_set1_epi8(b'\n' as i8);
        let zero = _mm_setzero_si128();
        let ones = _mm_set1_epi8(1);
        // Printable ASCII is the open interval (0x20, 0x7F). The compares are
        // signed, so bytes >= 0x80 are negative and fail `> 0x20`.
        let lo_print = _mm_set1_epi8(0x20i8);
        let hi_print = _mm_set1_epi8(0x7Fi8);
        // Spaces: 0x20 itself, or the open interval (0x08, 0x0E).
        let space_char = _mm_set1_epi8(0x20i8);
        let tab_lo = _mm_set1_epi8(0x08i8);
        let tab_hi = _mm_set1_epi8(0x0Ei8);

        for batch in vector_part.chunks(LANES * MAX_BATCH) {
            let mut line_acc = zero;

            for block in batch.chunks_exact(LANES) {
                let v = _mm_loadu_si128(block.as_ptr() as *const __m128i);

                let is_nl = _mm_cmpeq_epi8(v, nl_byte);
                line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));

                let gt_20 = _mm_cmpgt_epi8(v, lo_print);
                let lt_7f = _mm_cmpgt_epi8(hi_print, v);
                let is_printable = _mm_and_si128(gt_20, lt_7f);

                let is_sp = _mm_cmpeq_epi8(v, space_char);
                let gt_08 = _mm_cmpgt_epi8(v, tab_lo);
                let lt_0e = _mm_cmpgt_epi8(tab_hi, v);
                let is_space = _mm_or_si128(is_sp, _mm_and_si128(gt_08, lt_0e));

                // Only the low 16 bits of each mask are populated.
                let print_mask = _mm_movemask_epi8(is_printable) as u32;
                let space_mask = _mm_movemask_epi8(is_space) as u32;

                if (print_mask | space_mask) == 0xFFFF {
                    // No transparent bytes: a word starts at every printable
                    // position whose predecessor is not printable.
                    let prev_mask = (print_mask << 1) | (prev_in_word as u32);
                    total_words += (print_mask & !prev_mask).count_ones() as u64;
                    prev_in_word = (print_mask >> 15) & 1 == 1;
                } else {
                    // Transparent bytes present: walk only the positions that
                    // are printable or space, in ascending order. Positions in
                    // neither mask leave the state untouched, so skipping them
                    // is equivalent to visiting them.
                    let mut pending = print_mask | space_mask;
                    while pending != 0 {
                        let lowest = pending & pending.wrapping_neg();
                        if space_mask & lowest != 0 {
                            prev_in_word = false;
                        } else if !prev_in_word {
                            total_words += 1;
                            prev_in_word = true;
                        }
                        pending &= pending - 1;
                    }
                }
            }

            // Horizontal sum of the 16 u8 lanes via PSADBW against zero.
            let sad = _mm_sad_epu8(line_acc, zero);
            let hi = _mm_unpackhi_epi64(sad, sad);
            let t = _mm_add_epi64(sad, hi);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }
    }

    // Scalar tail (fewer than 16 bytes) with the same 3-state rules.
    for &b in tail {
        match b {
            b'\n' => {
                total_lines += 1;
                prev_in_word = false;
            }
            0x21..=0x7E => {
                if !prev_in_word {
                    total_words += 1;
                    prev_in_word = true;
                }
            }
            0x09..=0x0D | 0x20 => prev_in_word = false,
            _ => {} // transparent
        }
    }

    // Class of the first byte that is not transparent. Leading transparent
    // bytes must be skipped: a word from the previous chunk continues through
    // them into this chunk's first printable byte.
    let first_is_word = data
        .iter()
        .find_map(|&b| match b {
            0x21..=0x7E => Some(true),
            0x09..=0x0D | 0x20 => Some(false),
            _ => None,
        })
        .unwrap_or(false);

    (total_lines, total_words, first_is_word, prev_in_word)
}
439
440/// Dispatch to AVX2, SSE2, or scalar chunk counter.
441#[inline]
442fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
443    #[cfg(target_arch = "x86_64")]
444    {
445        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
446            return unsafe { count_lw_c_chunk_avx2(data) };
447        }
448        if data.len() >= 32 {
449            return unsafe { count_lw_c_chunk_sse2(data) };
450        }
451    }
452    count_lw_c_chunk(data)
453}
454
/// Count words + lines in a C locale chunk using 3-state logic, returning
/// counts plus boundary info for parallel chunk merging.
///
/// Returns `(line_count, word_count, first_is_printable, ends_in_word)`:
///   - `first_is_printable`: the first non-transparent byte of the chunk is
///     printable ASCII (false when the chunk has no non-transparent byte),
///   - `ends_in_word`: the word state after the last byte, starting from
///     "not in a word".
///
/// GNU wc C locale: only printable ASCII (0x21-0x7E) starts words; space bytes
/// (0x09-0x0D, 0x20) end them; bytes >= 0x80 and control chars are
/// transparent.
fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
    // Class of the first byte that is not transparent.
    let first_is_printable = data
        .iter()
        .find_map(|&b| match b {
            0x21..=0x7E => Some(true),
            0x09..=0x0D | 0x20 => Some(false),
            _ => None,
        })
        .unwrap_or(false);

    let mut lines = 0u64;
    let mut words = 0u64;
    let mut in_word = false;

    for &b in data {
        match b {
            // Printable ASCII — word content.
            0x21..=0x7E => {
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // Newline — counts a line and breaks the word.
            b'\n' => {
                lines += 1;
                in_word = false;
            }
            // Any other space byte — breaks the word.
            0x09..=0x0D | 0x20 => in_word = false,
            // Transparent — leaves `in_word` unchanged.
            _ => {}
        }
    }

    (lines, words, first_is_printable, in_word)
}
510
511/// Count words in UTF-8 locale using a state machine with 3-state logic.
512///
513/// Handles:
514/// - ASCII spaces (0x09-0x0D, 0x20): word break
515/// - ASCII printable (0x21-0x7E): word content
516/// - ASCII non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
517/// - Valid UTF-8 multi-byte → check Unicode space/printable
518/// - Invalid UTF-8: transparent (GNU wc skips invalid bytes without changing state)
519///
520/// Optimized with ASCII run skipping: when a word starts, skips remaining
521/// printable ASCII bytes without per-byte table lookups (~4x fewer state checks
522/// for English text with 5-char average word length).
523fn count_words_utf8(data: &[u8]) -> u64 {
524    let mut words = 0u64;
525    let mut in_word = false;
526    let mut i = 0;
527    let len = data.len();
528
529    while i < len {
530        let b = unsafe { *data.get_unchecked(i) };
531
532        if b >= 0x21 && b <= 0x7E {
533            // Printable ASCII (most common case for text) — word content
534            if !in_word {
535                in_word = true;
536                words += 1;
537            }
538            i += 1;
539            // Skip remaining printable ASCII (they don't change state)
540            while i < len {
541                let b = unsafe { *data.get_unchecked(i) };
542                if b >= 0x21 && b <= 0x7E {
543                    i += 1;
544                } else {
545                    break;
546                }
547            }
548        } else if b < 0x80 {
549            // Non-printable ASCII: space/tab/newline/controls
550            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
551            if class == 1 {
552                in_word = false;
553            }
554            // class == 2: transparent (controls 0x00-0x08, 0x0E-0x1F, 0x7F)
555            i += 1;
556        } else if b < 0xC2 {
557            i += 1;
558        } else if b < 0xE0 {
559            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
560                let cp = ((b as u32 & 0x1F) << 6)
561                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
562                if is_unicode_space(cp) {
563                    in_word = false;
564                } else if is_unicode_printable(cp) {
565                    if !in_word {
566                        in_word = true;
567                        words += 1;
568                    }
569                }
570                i += 2;
571            } else {
572                i += 1;
573            }
574        } else if b < 0xF0 {
575            if i + 2 < len
576                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
577                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
578            {
579                let cp = ((b as u32 & 0x0F) << 12)
580                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
581                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
582                if is_unicode_space(cp) {
583                    in_word = false;
584                } else if is_unicode_printable(cp) {
585                    if !in_word {
586                        in_word = true;
587                        words += 1;
588                    }
589                }
590                i += 3;
591            } else {
592                i += 1;
593            }
594        } else if b < 0xF5 {
595            if i + 3 < len
596                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
597                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
598                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
599            {
600                let cp = ((b as u32 & 0x07) << 18)
601                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
602                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
603                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
604                if is_unicode_space(cp) {
605                    in_word = false;
606                } else if is_unicode_printable(cp) {
607                    if !in_word {
608                        in_word = true;
609                        words += 1;
610                    }
611                }
612                i += 4;
613            } else {
614                i += 1;
615            }
616        } else {
617            i += 1;
618        }
619    }
620
621    words
622}
623
624/// Count lines and words using optimized strategies per locale.
625/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
626/// C locale: AVX2 SIMD fused counter when available, scalar fallback otherwise.
627pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
628    if utf8 {
629        count_lines_words_utf8_fused(data)
630    } else {
631        let (lines, words, _, _) = count_lw_c_chunk_fast(data);
632        (lines, words)
633    }
634}
635
636/// Fused lines+words counting in UTF-8 mode (single pass).
637/// Avoids separate memchr pass for newlines by counting them inline with words.
638///
639/// Key optimization: ASCII run skipping. Once a word starts (printable ASCII byte),
640/// we skip remaining printable ASCII bytes without any per-byte state checks.
641/// For English text (avg word ~5 chars), this reduces state transitions by ~4x.
642fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
643    let mut lines = 0u64;
644    let mut words = 0u64;
645    let mut in_word = false;
646    let mut i = 0;
647    let len = data.len();
648
649    while i < len {
650        let b = unsafe { *data.get_unchecked(i) };
651
652        if b >= 0x21 && b <= 0x7E {
653            // Printable ASCII (most common) — word content
654            if !in_word {
655                in_word = true;
656                words += 1;
657            }
658            i += 1;
659            // Skip remaining printable ASCII (they don't change state or count lines)
660            while i < len {
661                let b = unsafe { *data.get_unchecked(i) };
662                if b >= 0x21 && b <= 0x7E {
663                    i += 1;
664                } else {
665                    break;
666                }
667            }
668        } else if b == b'\n' {
669            lines += 1;
670            in_word = false;
671            i += 1;
672        } else if b == b' ' {
673            in_word = false;
674            i += 1;
675        } else if b < 0x80 {
676            // Other ASCII: \t, \r, \v, \f, controls
677            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
678            if class == 1 {
679                in_word = false;
680            }
681            // class == 2: transparent
682            i += 1;
683        } else if b < 0xC2 {
684            i += 1;
685        } else if b < 0xE0 {
686            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
687                let cp = ((b as u32 & 0x1F) << 6)
688                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
689                if is_unicode_space(cp) {
690                    in_word = false;
691                } else if is_unicode_printable(cp) {
692                    if !in_word {
693                        in_word = true;
694                        words += 1;
695                    }
696                }
697                i += 2;
698            } else {
699                i += 1;
700            }
701        } else if b < 0xF0 {
702            if i + 2 < len
703                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
704                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
705            {
706                let cp = ((b as u32 & 0x0F) << 12)
707                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
708                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
709                if is_unicode_space(cp) {
710                    in_word = false;
711                } else if is_unicode_printable(cp) {
712                    if !in_word {
713                        in_word = true;
714                        words += 1;
715                    }
716                }
717                i += 3;
718            } else {
719                i += 1;
720            }
721        } else if b < 0xF5 {
722            if i + 3 < len
723                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
724                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
725                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
726            {
727                let cp = ((b as u32 & 0x07) << 18)
728                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
729                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
730                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
731                if is_unicode_space(cp) {
732                    in_word = false;
733                } else if is_unicode_printable(cp) {
734                    if !in_word {
735                        in_word = true;
736                        words += 1;
737                    }
738                }
739                i += 4;
740            } else {
741                i += 1;
742            }
743        } else {
744            i += 1;
745        }
746    }
747
748    (lines, words)
749}
750
751/// Count lines, words, and chars using optimized strategies per locale.
752pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
753    if utf8 {
754        // Fused single-pass for lines+words, then fast char-counting pass
755        let (lines, words) = count_lines_words_utf8_fused(data);
756        let chars = count_chars_utf8(data);
757        (lines, words, chars)
758    } else {
759        // C locale: use optimized fused lines+words, chars = byte count
760        let (lines, words) = count_lines_words(data, false);
761        (lines, words, data.len() as u64)
762    }
763}
764
765/// Count UTF-8 characters by counting non-continuation bytes.
766/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
767/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
768///
769/// Uses AVX2 SIMD on x86_64 for ~32 bytes per cycle throughput.
770/// Falls back to 64-byte block processing with popcount on other architectures.
771pub fn count_chars_utf8(data: &[u8]) -> u64 {
772    #[cfg(target_arch = "x86_64")]
773    {
774        if is_x86_feature_detected!("avx2") {
775            return unsafe { count_chars_utf8_avx2(data) };
776        }
777    }
778    count_chars_utf8_scalar(data)
779}
780
/// AVX2 character counter: tallies non-continuation bytes 32 at a time.
///
/// Each vector is masked with 0xC0 and compared against 0x80 to find
/// continuation bytes; the complement adds 1 to a per-lane u8 accumulator.
/// The accumulator is reduced with PSADBW after at most 255 vectors, before a
/// lane could overflow. Bytes left over after the last full vector are
/// counted one by one.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    use std::arch::x86_64::*;

    const LANES: usize = 32;
    const MAX_BATCH: usize = 255;

    let (vector_part, tail) = data.split_at(data.len() - data.len() % LANES);
    let mut total = 0u64;

    // SAFETY: AVX2 availability is this function's caller contract. Every
    // load reads exactly `LANES` bytes from a `chunks_exact(LANES)` slice, and
    // `_mm256_loadu_si256` has no alignment requirement.
    unsafe {
        let cont_mask = _mm256_set1_epi8(0xC0u8 as i8);
        let cont_tag = _mm256_set1_epi8(0x80u8 as i8);
        let one = _mm256_set1_epi8(1);
        let zero = _mm256_setzero_si256();

        for batch in vector_part.chunks(LANES * MAX_BATCH) {
            let mut acc = zero;

            for block in batch.chunks_exact(LANES) {
                let v = _mm256_loadu_si256(block.as_ptr() as *const __m256i);
                let is_cont = _mm256_cmpeq_epi8(_mm256_and_si256(v, cont_mask), cont_tag);
                // (!is_cont) & 1: one for every byte that starts a character.
                acc = _mm256_add_epi8(acc, _mm256_andnot_si256(is_cont, one));
            }

            // Horizontal sum via PSADBW: sum of u8 lanes against zero.
            let sad = _mm256_sad_epu8(acc, zero);
            let upper = _mm256_extracti128_si256(sad, 1);
            let lower = _mm256_castsi256_si128(sad);
            let halves = _mm_add_epi64(lower, upper);
            let folded = _mm_add_epi64(halves, _mm_unpackhi_epi64(halves, halves));
            total += _mm_cvtsi128_si64(folded) as u64;
        }
    }

    let tail_chars = tail.iter().filter(|&&b| (b & 0xC0) != 0x80).count() as u64;
    total + tail_chars
}
844
/// Portable character counter used when no SIMD implementation is selected.
///
/// The input is processed in 64-byte blocks. A block whose bytes OR together
/// to a value below 0x80 is pure ASCII and contributes 64 characters at once.
/// Otherwise a 64-bit mask is built with one bit per byte that is not a
/// continuation byte (`10xxxxxx`), and its popcount is added. The bytes left
/// after the last full block are counted individually.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    #[inline]
    fn starts_char(byte: u8) -> bool {
        (byte & 0xC0) != 0x80
    }

    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();
    let mut total = 0u64;

    for block in blocks {
        // Fast path: no high bit set anywhere means every byte is a character.
        let combined = block.iter().fold(0u8, |acc, &byte| acc | byte);
        if combined < 0x80 {
            total += 64;
            continue;
        }

        let char_mask = block
            .iter()
            .enumerate()
            .fold(0u64, |mask, (bit, &byte)| mask | ((starts_char(byte) as u64) << bit));
        total += char_mask.count_ones() as u64;
    }

    total + tail.iter().filter(|&&byte| starts_char(byte)).count() as u64
}
896
/// Character count for the C/POSIX locale, where every byte is exactly one
/// character: simply the length of `data`.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_total = data.len();
    byte_total as u64
}
902
903/// Count characters, choosing behavior based on locale.
904#[inline]
905pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
906    if utf8 {
907        count_chars_utf8(data)
908    } else {
909        count_chars_c(data)
910    }
911}
912
/// Report whether the process locale selects a UTF-8 encoding.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order. The first
/// variable that is set to a non-empty (Unicode-readable) value decides the
/// answer: true when the value contains `utf-8` or `utf8`, compared
/// case-insensitively. If none of the variables qualifies, the result is false.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let normalized = value.to_ascii_lowercase();
            normalized.contains("utf-8") || normalized.contains("utf8")
        })
}
925
/// Decode one UTF-8 character from the start of a byte slice.
///
/// Returns `(codepoint, byte_length)`. If the slice does not begin with a
/// well-formed UTF-8 sequence, returns `(first byte as u32, 1)` so the caller
/// can skip exactly one byte and resynchronise.
///
/// Only well-formed sequences as defined by RFC 3629 are decoded. Rejected:
/// - stray continuation bytes and the overlong leaders 0xC0/0xC1,
/// - truncated sequences and sequences with a bad continuation byte,
/// - overlong 3-byte (E0 80..9F) and 4-byte (F0 80..8F) encodings,
/// - UTF-16 surrogates U+D800..=U+DFFF (ED A0..BF),
/// - code points above U+10FFFF (F4 90.. and leaders 0xF5..=0xFF).
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    /// Payload (low six bits) of a continuation byte.
    #[inline]
    fn payload(byte: u8) -> u32 {
        (byte & 0x3F) as u32
    }

    let b0 = bytes[0];
    let lead = b0 as u32;
    if b0 < 0x80 {
        return (lead, 1);
    }

    // The permitted range of the second byte depends on the leader; this is
    // what excludes overlong forms, surrogates and values above U+10FFFF.
    let decoded = match *bytes {
        [0xC2..=0xDF, b1 @ 0x80..=0xBF, ..] => Some((((lead & 0x1F) << 6) | payload(b1), 2)),

        [0xE0, b1 @ 0xA0..=0xBF, b2 @ 0x80..=0xBF, ..]
        | [0xE1..=0xEC, b1 @ 0x80..=0xBF, b2 @ 0x80..=0xBF, ..]
        | [0xED, b1 @ 0x80..=0x9F, b2 @ 0x80..=0xBF, ..]
        | [0xEE..=0xEF, b1 @ 0x80..=0xBF, b2 @ 0x80..=0xBF, ..] => Some((
            ((lead & 0x0F) << 12) | (payload(b1) << 6) | payload(b2),
            3,
        )),

        [0xF0, b1 @ 0x90..=0xBF, b2 @ 0x80..=0xBF, b3 @ 0x80..=0xBF, ..]
        | [0xF1..=0xF3, b1 @ 0x80..=0xBF, b2 @ 0x80..=0xBF, b3 @ 0x80..=0xBF, ..]
        | [0xF4, b1 @ 0x80..=0x8F, b2 @ 0x80..=0xBF, b3 @ 0x80..=0xBF, ..] => Some((
            ((lead & 0x07) << 18) | (payload(b1) << 12) | (payload(b2) << 6) | payload(b3),
            4,
        )),

        _ => None,
    };

    decoded.unwrap_or((lead, 1))
}
969
/// Report whether `cp` is treated as a zero-width character (combining marks,
/// format controls, variation selectors and similar).
///
/// GNU wc measures display width with wcwidth(), which returns 0 for these
/// code points; the table below is this file's approximation of that set.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    // Inclusive (first, last) ranges, sorted ascending and non-overlapping so
    // that a binary search can be used.
    const ZERO_WIDTH_RANGES: &[(u32, u32)] = &[
        (0x0300, 0x036F), // Combining Diacritical Marks
        (0x0483, 0x0489), // Cyrillic combining marks
        (0x0591, 0x05BD), // Hebrew combining marks
        (0x05BF, 0x05BF),
        (0x05C1, 0x05C2),
        (0x05C4, 0x05C5),
        (0x05C7, 0x05C7),
        (0x0600, 0x0605), // Arabic number signs
        (0x0610, 0x061A), // Arabic combining marks
        (0x064B, 0x065F), // Arabic combining marks
        (0x0670, 0x0670),
        (0x06D6, 0x06DD),
        (0x06DF, 0x06E4),
        (0x06E7, 0x06E8),
        (0x06EA, 0x06ED),
        (0x070F, 0x070F),
        (0x0711, 0x0711),
        (0x0730, 0x074A),
        (0x07A6, 0x07B0),
        (0x07EB, 0x07F3),
        (0x07FD, 0x07FD),
        (0x0816, 0x0819),
        (0x081B, 0x0823),
        (0x0825, 0x0827),
        (0x0829, 0x082D),
        (0x0859, 0x085B),
        (0x08D3, 0x08E1),
        (0x08E3, 0x0902),
        (0x093A, 0x093A),
        (0x093C, 0x093C),
        (0x0941, 0x0948),
        (0x094D, 0x094D),
        (0x0951, 0x0957),
        (0x0962, 0x0963),
        (0x0981, 0x0981),
        (0x09BC, 0x09BC),
        (0x09C1, 0x09C4),
        (0x09CD, 0x09CD),
        (0x09E2, 0x09E3),
        (0x09FE, 0x09FE),
        (0x0A01, 0x0A02),
        (0x0A3C, 0x0A3C),
        (0x0A41, 0x0A42),
        (0x0A47, 0x0A48),
        (0x0A4B, 0x0A4D),
        (0x0A51, 0x0A51),
        (0x0A70, 0x0A71),
        (0x0A75, 0x0A75),
        (0x0A81, 0x0A82),
        (0x0ABC, 0x0ABC),
        (0x0AC1, 0x0AC5),
        (0x0AC7, 0x0AC8),
        (0x0ACD, 0x0ACD),
        (0x0AE2, 0x0AE3),
        (0x0AFA, 0x0AFF),
        (0x0B01, 0x0B01),
        (0x0B3C, 0x0B3C),
        (0x0B3F, 0x0B3F),
        (0x0B41, 0x0B44),
        (0x0B4D, 0x0B4D),
        (0x0B56, 0x0B56),
        (0x0B62, 0x0B63),
        (0x0B82, 0x0B82),
        (0x0BC0, 0x0BC0),
        (0x0BCD, 0x0BCD),
        (0x0C00, 0x0C00),
        (0x0C04, 0x0C04),
        (0x0C3E, 0x0C40),
        (0x0C46, 0x0C48),
        (0x0C4A, 0x0C4D),
        (0x0C55, 0x0C56),
        (0x0C62, 0x0C63),
        (0x0C81, 0x0C81),
        (0x0CBC, 0x0CBC),
        (0x0CBF, 0x0CBF),
        (0x0CC6, 0x0CC6),
        (0x0CCC, 0x0CCD),
        (0x0CE2, 0x0CE3),
        (0x0D00, 0x0D01),
        (0x0D3B, 0x0D3C),
        (0x0D41, 0x0D44),
        (0x0D4D, 0x0D4D),
        (0x0D62, 0x0D63),
        (0x0DCA, 0x0DCA),
        (0x0DD2, 0x0DD4),
        (0x0DD6, 0x0DD6),
        (0x0E31, 0x0E31),
        (0x0E34, 0x0E3A),
        (0x0E47, 0x0E4E),
        (0x0EB1, 0x0EB1),
        (0x0EB4, 0x0EBC),
        (0x0EC8, 0x0ECD),
        (0x0F18, 0x0F19),
        (0x0F35, 0x0F35),
        (0x0F37, 0x0F37),
        (0x0F39, 0x0F39),
        (0x0F71, 0x0F7E),
        (0x0F80, 0x0F84),
        (0x0F86, 0x0F87),
        (0x0F8D, 0x0F97),
        (0x0F99, 0x0FBC),
        (0x0FC6, 0x0FC6),
        (0x102D, 0x1030),
        (0x1032, 0x1037),
        (0x1039, 0x103A),
        (0x103D, 0x103E),
        (0x1058, 0x1059),
        (0x105E, 0x1060),
        (0x1071, 0x1074),
        (0x1082, 0x1082),
        (0x1085, 0x1086),
        (0x108D, 0x108D),
        (0x109D, 0x109D),
        (0x1160, 0x11FF), // Hangul Jamo medial vowels and final consonants
        (0x135D, 0x135F),
        (0x1712, 0x1714),
        (0x1732, 0x1734),
        (0x1752, 0x1753),
        (0x1772, 0x1773),
        (0x17B4, 0x17B5),
        (0x17B7, 0x17BD),
        (0x17C6, 0x17C6),
        (0x17C9, 0x17D3),
        (0x17DD, 0x17DD),
        (0x180B, 0x180D),
        (0x1885, 0x1886),
        (0x18A9, 0x18A9),
        (0x1920, 0x1922),
        (0x1927, 0x1928),
        (0x1932, 0x1932),
        (0x1939, 0x193B),
        (0x1A17, 0x1A18),
        (0x1A1B, 0x1A1B),
        (0x1A56, 0x1A56),
        (0x1A58, 0x1A5E),
        (0x1A60, 0x1A60),
        (0x1A62, 0x1A62),
        (0x1A65, 0x1A6C),
        (0x1A73, 0x1A7C),
        (0x1A7F, 0x1A7F),
        (0x1AB0, 0x1ABE),
        (0x1B00, 0x1B03),
        (0x1B34, 0x1B34),
        (0x1B36, 0x1B3A),
        (0x1B3C, 0x1B3C),
        (0x1B42, 0x1B42),
        (0x1B6B, 0x1B73),
        (0x1B80, 0x1B81),
        (0x1BA2, 0x1BA5),
        (0x1BA8, 0x1BA9),
        (0x1BAB, 0x1BAD),
        (0x1BE6, 0x1BE6),
        (0x1BE8, 0x1BE9),
        (0x1BED, 0x1BED),
        (0x1BEF, 0x1BF1),
        (0x1C2C, 0x1C33),
        (0x1C36, 0x1C37),
        (0x1CD0, 0x1CD2),
        (0x1CD4, 0x1CE0),
        (0x1CE2, 0x1CE8),
        (0x1CED, 0x1CED),
        (0x1CF4, 0x1CF4),
        (0x1CF8, 0x1CF9),
        (0x1DC0, 0x1DF9),
        (0x1DFB, 0x1DFF),
        (0x200B, 0x200F), // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        (0x202A, 0x202E), // Bidi control chars
        (0x2060, 0x2064), // Word joiner, invisible operators
        (0x2066, 0x206F), // Bidi isolates
        (0x20D0, 0x20F0), // Combining marks for symbols
        (0xFE00, 0xFE0F), // Variation Selectors
        (0xFE20, 0xFE2F), // Combining Half Marks
        (0xFEFF, 0xFEFF), // Zero Width No-Break Space (BOM)
        (0xFFF9, 0xFFFB), // Interlinear annotation anchors
        (0x1D167, 0x1D169),
        (0x1D173, 0x1D182),
        (0x1D185, 0x1D18B),
        (0x1D1AA, 0x1D1AD),
        (0x1D242, 0x1D244),
        (0xE0001, 0xE0001),
        (0xE0020, 0xE007F),
        (0xE0100, 0xE01EF), // Variation Selectors Supplement
    ];

    // First range whose upper bound is not below `cp`; `cp` is a member iff
    // that range also starts at or before it.
    let candidate = ZERO_WIDTH_RANGES.partition_point(|&(_, last)| last < cp);
    ZERO_WIDTH_RANGES
        .get(candidate)
        .map_or(false, |&(first, _)| first <= cp)
}
1160
/// Report whether `cp` is treated as a double-width (East Asian
/// Wide/Fullwidth style) character, i.e. one that occupies two display columns.
///
/// The table is this file's approximation of the code points for which
/// glibc's wcwidth() returns 2.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    // Inclusive (first, last) ranges, sorted ascending and non-overlapping so
    // that a binary search can be used.
    const WIDE_RANGES: &[(u32, u32)] = &[
        (0x1100, 0x115F), // Hangul Jamo
        (0x231A, 0x231B), // Watch, Hourglass
        (0x2329, 0x232A), // Angle Brackets
        (0x23E9, 0x23F3), // Various symbols
        (0x23F8, 0x23FA),
        (0x25FD, 0x25FE),
        (0x2614, 0x2615),
        (0x2648, 0x2653),
        (0x267F, 0x267F),
        (0x2693, 0x2693),
        (0x26A1, 0x26A1),
        (0x26AA, 0x26AB),
        (0x26BD, 0x26BE),
        (0x26C4, 0x26C5),
        (0x26CE, 0x26CE),
        (0x26D4, 0x26D4),
        (0x26EA, 0x26EA),
        (0x26F2, 0x26F3),
        (0x26F5, 0x26F5),
        (0x26FA, 0x26FA),
        (0x26FD, 0x26FD),
        (0x2702, 0x2702),
        (0x2705, 0x2705),
        (0x2708, 0x270D),
        (0x270F, 0x270F),
        (0x2712, 0x2712),
        (0x2714, 0x2714),
        (0x2716, 0x2716),
        (0x271D, 0x271D),
        (0x2721, 0x2721),
        (0x2728, 0x2728),
        (0x2733, 0x2734),
        (0x2744, 0x2744),
        (0x2747, 0x2747),
        (0x274C, 0x274C),
        (0x274E, 0x274E),
        (0x2753, 0x2755),
        (0x2757, 0x2757),
        (0x2763, 0x2764),
        (0x2795, 0x2797),
        (0x27A1, 0x27A1),
        (0x27B0, 0x27B0),
        (0x27BF, 0x27BF),
        (0x2934, 0x2935),
        (0x2B05, 0x2B07),
        (0x2B1B, 0x2B1C),
        (0x2B50, 0x2B50),
        (0x2B55, 0x2B55),
        (0x2E80, 0x303E), // CJK Radicals, Kangxi Radicals, Ideographic Description
        (0x3040, 0x33BF), // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        (0x3400, 0x4DBF), // CJK Unified Ideographs Extension A
        (0x4E00, 0xA4CF), // CJK Unified Ideographs, Yi
        (0xA960, 0xA97C), // Hangul Jamo Extended-A
        (0xAC00, 0xD7A3), // Hangul Syllables
        (0xF900, 0xFAFF), // CJK Compatibility Ideographs
        (0xFE10, 0xFE19), // Vertical Forms
        (0xFE30, 0xFE6F), // CJK Compatibility Forms
        (0xFF01, 0xFF60), // Fullwidth ASCII variants
        (0xFFE0, 0xFFE6), // Fullwidth Signs
        (0x1F004, 0x1F004),
        (0x1F0CF, 0x1F0CF),
        (0x1F170, 0x1F171),
        (0x1F17E, 0x1F17F),
        (0x1F18E, 0x1F18E),
        (0x1F191, 0x1F19A),
        (0x1F1E0, 0x1F1FF), // Regional Indicators
        (0x1F200, 0x1F202),
        (0x1F210, 0x1F23B),
        (0x1F240, 0x1F248),
        (0x1F250, 0x1F251),
        (0x1F260, 0x1F265),
        (0x1F300, 0x1F64F), // Misc Symbols, Emoticons
        (0x1F680, 0x1F6FF), // Transport Symbols
        (0x1F900, 0x1F9FF), // Supplemental Symbols
        (0x1FA00, 0x1FA6F),
        (0x1FA70, 0x1FAFF),
        (0x20000, 0x2FFFD), // Plane 2 (CJK Unified Ideographs extensions)
        (0x30000, 0x3FFFD), // Plane 3 (CJK Unified Ideographs extensions)
    ];

    // First range whose upper bound is not below `cp`; `cp` is a member iff
    // that range also starts at or before it.
    let candidate = WIDE_RANGES.partition_point(|&(_, last)| last < cp);
    WIDE_RANGES
        .get(candidate)
        .map_or(false, |&(first, _)| first <= cp)
}
1247
/// Maximum display width of any line in `data`, using C/POSIX locale rules.
///
/// Per-byte behaviour:
/// - `\n` and `\f`: end the line (its width competes for the maximum, then
///   the column and per-line peak are reset)
/// - `\r`: moves the column back to 0 without ending the line
/// - `\t`: advances the column to the next multiple of 8
/// - space and printable ASCII (0x20..=0x7E): width 1
/// - every other byte (controls, 0x7F, high bytes): width 0
///
/// A line's width is the highest column reached on it, so text overwritten
/// after a `\r` still counts. Runs of non-space printable bytes
/// (0x21..=0x7E) are measured in one step.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let is_graph = |byte: u8| (0x21..=0x7E).contains(&byte);

    let mut widest = 0u64; // widest completed line so far
    let mut line_peak = 0u64; // highest column reached on the current line
    let mut column = 0u64; // current column on the current line
    let mut rest = data;

    while let Some((&byte, tail)) = rest.split_first() {
        if is_graph(byte) {
            // Consume the whole run of printable non-space bytes at once.
            let run = rest
                .iter()
                .position(|&b| !is_graph(b))
                .unwrap_or(rest.len());
            column += run as u64;
            line_peak = line_peak.max(column);
            rest = &rest[run..];
            continue;
        }

        match byte {
            b'\n' | 0x0C => {
                widest = widest.max(line_peak);
                line_peak = 0;
                column = 0;
            }
            b'\r' => column = 0,
            b'\t' => {
                column = (column + 8) & !7;
                line_peak = line_peak.max(column);
            }
            b' ' => {
                column += 1;
                line_peak = line_peak.max(column);
            }
            _ => {} // non-printable: width 0
        }
        rest = tail;
    }

    // The final line may lack a terminator.
    widest.max(line_peak)
}
1329
1330/// Compute maximum display width of any line (UTF-8 locale).
1331///
1332/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
1333/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
1334///
1335/// Optimized with printable ASCII run counting for common text.
1336pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1337    let mut max_len: u64 = 0;
1338    let mut line_len: u64 = 0;
1339    let mut linepos: u64 = 0;
1340    let mut i = 0;
1341    let len = data.len();
1342
1343    while i < len {
1344        let b = unsafe { *data.get_unchecked(i) };
1345
1346        if b >= 0x21 && b <= 0x7E {
1347            // Printable non-space ASCII (most common) — count run length
1348            i += 1;
1349            let mut run = 1u64;
1350            while i < len {
1351                let b = unsafe { *data.get_unchecked(i) };
1352                if b >= 0x21 && b <= 0x7E {
1353                    run += 1;
1354                    i += 1;
1355                } else {
1356                    break;
1357                }
1358            }
1359            linepos += run;
1360            if linepos > line_len {
1361                line_len = linepos;
1362            }
1363        } else if b < 0x80 {
1364            // Other ASCII: space, tab, newline, controls
1365            match b {
1366                b' ' => {
1367                    linepos += 1;
1368                    if linepos > line_len {
1369                        line_len = linepos;
1370                    }
1371                }
1372                b'\n' => {
1373                    if line_len > max_len {
1374                        max_len = line_len;
1375                    }
1376                    linepos = 0;
1377                    line_len = 0;
1378                }
1379                b'\t' => {
1380                    linepos = (linepos + 8) & !7;
1381                    if linepos > line_len {
1382                        line_len = linepos;
1383                    }
1384                }
1385                b'\r' => {
1386                    linepos = 0;
1387                }
1388                0x0C => {
1389                    if line_len > max_len {
1390                        max_len = line_len;
1391                    }
1392                    linepos = 0;
1393                    line_len = 0;
1394                }
1395                _ => {} // Non-printable: width 0
1396            }
1397            i += 1;
1398        } else {
1399            // Multibyte UTF-8
1400            let (cp, len) = decode_utf8(&data[i..]);
1401
1402            // C1 control characters (0x80..0x9F): non-printable, width 0
1403            if cp <= 0x9F {
1404                // width 0
1405            } else if is_zero_width(cp) {
1406                // Combining marks, zero-width chars: width 0
1407            } else if is_wide_char(cp) {
1408                linepos += 2;
1409                if linepos > line_len {
1410                    line_len = linepos;
1411                }
1412            } else {
1413                // Regular printable Unicode character: width 1
1414                linepos += 1;
1415                if linepos > line_len {
1416                    line_len = linepos;
1417                }
1418            }
1419            i += len;
1420        }
1421    }
1422
1423    // Handle last line
1424    if line_len > max_len {
1425        max_len = line_len;
1426    }
1427
1428    max_len
1429}
1430
1431/// Compute maximum display width, choosing behavior based on locale.
1432#[inline]
1433pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1434    if utf8 {
1435        max_line_length_utf8(data)
1436    } else {
1437        max_line_length_c(data)
1438    }
1439}
1440
1441/// Count all metrics using optimized individual passes.
1442///
1443/// Each metric uses its own optimized algorithm:
1444/// - Lines: SIMD-accelerated memchr
1445/// - Words: 3-state scalar/state-machine (locale-dependent)
1446/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1447/// - Max line length: locale-aware display width tracking
1448///
1449/// Multi-pass is faster than single-pass because each pass has a tight,
1450/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1451/// making subsequent passes nearly free for memory bandwidth.
1452pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1453    if utf8 {
1454        let (lines, words) = count_lines_words_utf8_fused(data);
1455        WcCounts {
1456            lines,
1457            words,
1458            bytes: data.len() as u64,
1459            chars: count_chars_utf8(data),
1460            max_line_length: max_line_length_utf8(data),
1461        }
1462    } else {
1463        WcCounts {
1464            lines: count_lines(data),
1465            words: count_words_locale(data, false),
1466            bytes: data.len() as u64,
1467            chars: data.len() as u64,
1468            max_line_length: max_line_length_c(data),
1469        }
1470    }
1471}
1472
/// Heuristic check for "probably all ASCII" that inspects at most three
/// windows of up to 256 bytes: the start, the middle and the end of `data`.
///
/// Returns false as soon as a sampled byte is >= 0x80 and true otherwise
/// (including for empty input). Bytes outside the sampled windows are never
/// examined, so a true result is not a proof that the data is ASCII.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    const WINDOW: usize = 256;

    let len = data.len();
    let window = WINDOW.min(len);

    // Leading window (empty for empty input, which is trivially ASCII).
    if !data[..window].is_ascii() {
        return false;
    }

    // Middle window, only when it is not already covered by head and tail.
    if len > window * 2 {
        let start = (len / 2).saturating_sub(window / 2);
        let end = (start + window).min(len);
        if !data[start..end].is_ascii() {
            return false;
        }
    }

    // Trailing window, only when the head did not already cover everything.
    len <= window || data[len - window..].is_ascii()
}
1531
1532// ──────────────────────────────────────────────────
1533// Parallel counting for large files
1534// ──────────────────────────────────────────────────
1535
1536/// Split data into chunks at newline boundaries for parallel processing.
1537/// Returns slices where each slice (except possibly the last) ends with `\n`.
1538/// Splitting at newlines guarantees word boundaries in any locale,
1539/// enabling safe parallel word counting without boundary adjustment.
1540fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1541    if data.is_empty() || num_chunks <= 1 {
1542        return vec![data];
1543    }
1544    let chunk_size = data.len() / num_chunks;
1545    let mut chunks = Vec::with_capacity(num_chunks);
1546    let mut pos = 0;
1547
1548    for _ in 0..num_chunks - 1 {
1549        let target = pos + chunk_size;
1550        if target >= data.len() {
1551            break;
1552        }
1553        let boundary = memchr::memchr(b'\n', &data[target..])
1554            .map(|p| target + p + 1)
1555            .unwrap_or(data.len());
1556        if boundary > pos {
1557            chunks.push(&data[pos..boundary]);
1558        }
1559        pos = boundary;
1560    }
1561    if pos < data.len() {
1562        chunks.push(&data[pos..]);
1563    }
1564    chunks
1565}
1566
1567/// Count newlines in parallel using SIMD memchr + rayon.
1568/// Each thread gets at least 1MB (to amortize rayon scheduling overhead).
1569pub fn count_lines_parallel(data: &[u8]) -> u64 {
1570    if data.len() < PARALLEL_THRESHOLD {
1571        return count_lines(data);
1572    }
1573
1574    let num_threads = rayon::current_num_threads().max(1);
1575    // Ensure chunks are large enough to amortize SIMD setup overhead
1576    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1577
1578    data.par_chunks(chunk_size)
1579        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1580        .sum()
1581}
1582
1583/// Count words in parallel with boundary adjustment.
1584pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1585    if data.len() < PARALLEL_THRESHOLD {
1586        return count_words_locale(data, utf8);
1587    }
1588
1589    let num_threads = rayon::current_num_threads().max(1);
1590
1591    if utf8 {
1592        // UTF-8: split at newline boundaries for safe parallel word counting.
1593        // Newlines are always word boundaries, so no boundary adjustment needed.
1594        let chunks = split_at_newlines(data, num_threads);
1595        chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1596    } else {
1597        // C locale: parallel 3-state word counting with boundary adjustment
1598        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1599
1600        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1601
1602        // Each chunk returns (lines, word_count, first_active_is_printable, ends_in_word)
1603        let results: Vec<(u64, u64, bool, bool)> = chunks
1604            .par_iter()
1605            .map(|chunk| count_lw_c_chunk(chunk))
1606            .collect();
1607
1608        let mut total = 0u64;
1609        for i in 0..results.len() {
1610            total += results[i].1;
1611            // Boundary adjustment: if previous chunk ended in_word AND
1612            // current chunk's first non-transparent byte is printable,
1613            // the word was split across chunks — subtract the overcount.
1614            if i > 0 && results[i - 1].3 && results[i].2 {
1615                total -= 1;
1616            }
1617        }
1618        total
1619    }
1620}
1621
1622/// Count UTF-8 characters in parallel.
1623pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1624    if !utf8 {
1625        return data.len() as u64;
1626    }
1627    if data.len() < PARALLEL_THRESHOLD {
1628        return count_chars_utf8(data);
1629    }
1630
1631    let num_threads = rayon::current_num_threads().max(1);
1632    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1633
1634    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1635}
1636
1637/// Count lines + words + bytes in a single fused pass (the default wc mode).
1638/// Avoids separate passes entirely — combines newline counting with word detection.
1639pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1640    let (lines, words) = count_lines_words(data, utf8);
1641    (lines, words, data.len() as u64)
1642}
1643
1644/// Parallel counting of lines + words + bytes only (no chars).
1645/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1646/// C locale: single fused pass per chunk counts BOTH lines and words.
1647/// UTF-8: checks ASCII first for C locale fast path, else splits at newlines
1648/// for safe parallel UTF-8 word counting.
1649pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1650    if data.len() < PARALLEL_THRESHOLD {
1651        // Small file: use fused single-pass
1652        return count_lwb(data, utf8);
1653    }
1654
1655    let num_threads = rayon::current_num_threads().max(1);
1656
1657    let (lines, words) = if !utf8 {
1658        // C locale: FUSED parallel lines+words counting — single pass per chunk
1659        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1660
1661        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1662        let results: Vec<(u64, u64, bool, bool)> = chunks
1663            .par_iter()
1664            .map(|chunk| count_lw_c_chunk_fast(chunk))
1665            .collect();
1666
1667        let mut line_total = 0u64;
1668        let mut word_total = 0u64;
1669        for i in 0..results.len() {
1670            line_total += results[i].0;
1671            word_total += results[i].1;
1672            if i > 0 && results[i - 1].3 && results[i].2 {
1673                word_total -= 1;
1674            }
1675        }
1676
1677        (line_total, word_total)
1678    } else {
1679        // UTF-8 locale: check if ASCII for faster C locale path
1680        let is_ascii = check_ascii_sample(data);
1681        if is_ascii {
1682            // Pure ASCII: use C locale parallel path (arbitrary chunks OK)
1683            let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1684            let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1685            let results: Vec<(u64, u64, bool, bool)> = chunks
1686                .par_iter()
1687                .map(|chunk| count_lw_c_chunk_fast(chunk))
1688                .collect();
1689
1690            let mut line_total = 0u64;
1691            let mut word_total = 0u64;
1692            for i in 0..results.len() {
1693                line_total += results[i].0;
1694                word_total += results[i].1;
1695                if i > 0 && results[i - 1].3 && results[i].2 {
1696                    word_total -= 1;
1697                }
1698            }
1699            (line_total, word_total)
1700        } else {
1701            // Non-ASCII UTF-8: split at newline boundaries for safe parallel
1702            // word counting. Newlines always break words, so no adjustment needed.
1703            let chunks = split_at_newlines(data, num_threads);
1704            let results: Vec<(u64, u64)> = chunks
1705                .par_iter()
1706                .map(|chunk| count_lines_words_utf8_fused(chunk))
1707                .collect();
1708            let mut line_total = 0u64;
1709            let mut word_total = 0u64;
1710            for (l, w) in results {
1711                line_total += l;
1712                word_total += w;
1713            }
1714            (line_total, word_total)
1715        }
1716    };
1717
1718    (lines, words, data.len() as u64)
1719}
1720
1721/// Combined parallel counting of lines + words + chars.
1722/// UTF-8: splits at newline boundaries for fused lines+words+chars per chunk.
1723/// C locale: fused parallel lines+words with boundary adjustment + parallel chars.
1724pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1725    if data.len() < PARALLEL_THRESHOLD {
1726        let lines = count_lines(data);
1727        let words = count_words_locale(data, utf8);
1728        let chars = count_chars(data, utf8);
1729        return (lines, words, chars);
1730    }
1731
1732    let num_threads = rayon::current_num_threads().max(1);
1733
1734    if utf8 {
1735        // UTF-8: fused parallel lines+words+chars per chunk (split at newlines)
1736        let chunks = split_at_newlines(data, num_threads);
1737        let results: Vec<(u64, u64, u64)> = chunks
1738            .par_iter()
1739            .map(|chunk| {
1740                let (lines, words) = count_lines_words_utf8_fused(chunk);
1741                let chars = count_chars_utf8(chunk);
1742                (lines, words, chars)
1743            })
1744            .collect();
1745        let mut lines = 0u64;
1746        let mut words = 0u64;
1747        let mut chars = 0u64;
1748        for (l, w, c) in results {
1749            lines += l;
1750            words += w;
1751            chars += c;
1752        }
1753        (lines, words, chars)
1754    } else {
1755        // C locale: fused parallel lines+words + parallel chars (= byte count)
1756        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1757        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1758        let results: Vec<(u64, u64, bool, bool)> = chunks
1759            .par_iter()
1760            .map(|chunk| count_lw_c_chunk_fast(chunk))
1761            .collect();
1762        let mut lines = 0u64;
1763        let mut words = 0u64;
1764        for i in 0..results.len() {
1765            lines += results[i].0;
1766            words += results[i].1;
1767            if i > 0 && results[i - 1].3 && results[i].2 {
1768                words -= 1;
1769            }
1770        }
1771        (lines, words, data.len() as u64)
1772    }
1773}
1774
1775/// Parallel max line length computation.
1776/// Splits at newline boundaries so each chunk independently computes correct
1777/// max line width (since newlines reset position tracking).
1778pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1779    if data.len() < PARALLEL_THRESHOLD {
1780        return max_line_length(data, utf8);
1781    }
1782    let num_threads = rayon::current_num_threads().max(1);
1783    let chunks = split_at_newlines(data, num_threads);
1784    chunks
1785        .par_iter()
1786        .map(|chunk| {
1787            if utf8 {
1788                max_line_length_utf8(chunk)
1789            } else {
1790                max_line_length_c(chunk)
1791            }
1792        })
1793        .max()
1794        .unwrap_or(0)
1795}
1796
1797/// Parallel counting of all metrics at once.
1798/// Splits at newline boundaries for safe parallel word + max_line_length counting.
1799/// Each chunk computes all metrics in a single traversal group, maximizing cache reuse.
1800pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1801    if data.len() < PARALLEL_THRESHOLD {
1802        return count_all(data, utf8);
1803    }
1804
1805    let num_threads = rayon::current_num_threads().max(1);
1806    let chunks = split_at_newlines(data, num_threads);
1807
1808    if utf8 {
1809        let results: Vec<(u64, u64, u64, u64)> = chunks
1810            .par_iter()
1811            .map(|chunk| {
1812                let (lines, words) = count_lines_words_utf8_fused(chunk);
1813                let chars = count_chars_utf8(chunk);
1814                let max_ll = max_line_length_utf8(chunk);
1815                (lines, words, chars, max_ll)
1816            })
1817            .collect();
1818
1819        let mut counts = WcCounts {
1820            bytes: data.len() as u64,
1821            ..Default::default()
1822        };
1823        for (l, w, c, m) in results {
1824            counts.lines += l;
1825            counts.words += w;
1826            counts.chars += c;
1827            if m > counts.max_line_length {
1828                counts.max_line_length = m;
1829            }
1830        }
1831        counts
1832    } else {
1833        // C locale: fused lines+words per chunk + max_line_length per chunk
1834        let results: Vec<(u64, u64, u64)> = chunks
1835            .par_iter()
1836            .map(|chunk| {
1837                let (lines, words) = count_lines_words(chunk, false);
1838                let max_ll = max_line_length_c(chunk);
1839                (lines, words, max_ll)
1840            })
1841            .collect();
1842
1843        let mut counts = WcCounts {
1844            bytes: data.len() as u64,
1845            chars: data.len() as u64,
1846            ..Default::default()
1847        };
1848        for (l, w, m) in &results {
1849            counts.lines += l;
1850            counts.words += w;
1851            if *m > counts.max_line_length {
1852                counts.max_line_length = *m;
1853            }
1854        }
1855        counts
1856    }
1857}