//! coreutils_rs/wc/core.rs — core counting routines (lines, words, bytes, chars)
//! for a GNU-wc-compatible `wc` implementation.
1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Minimum data size to use parallel processing (1MB).
/// Rayon overhead is ~5-10μs per task; at 1MB with memchr SIMD (~10 GB/s),
/// each chunk takes ~100μs, so overhead is < 10%.
/// NOTE(review): presumably consumed by a rayon-based driver outside this
/// chunk (the `rayon::prelude` import suggests so) — confirm at the call site.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
8
9/// Results from counting a byte slice.
10#[derive(Debug, Clone, Default, PartialEq, Eq)]
11pub struct WcCounts {
12    pub lines: u64,
13    pub words: u64,
14    pub bytes: u64,
15    pub chars: u64,
16    pub max_line_length: u64,
17}
18
// ──────────────────────────────────────────────────
// 2-state byte classification for word counting
// ──────────────────────────────────────────────────
//
// GNU wc uses 2-state word counting:
//   0 = word content: starts or continues a word (any non-whitespace byte)
//   1 = space (word break): ends any current word
//
// Whitespace bytes (C locale): 0x09 TAB, 0x0A LF, 0x0B VT, 0x0C FF, 0x0D CR, 0x20 SPACE, 0xA0 NBSP.
// Everything else (including NUL, control chars, high bytes 0x80-0xFF except 0xA0) is word content.
29
/// 2-state single-byte classification for the UTF-8 locale:
///   0 = word content (starts or continues a word)
///   1 = whitespace (word break)
/// Only ASCII whitespace lives in this table; multi-byte Unicode spaces are
/// handled by the UTF-8 state machine separately.
const fn make_byte_class_utf8() -> [u8; 256] {
    let mut table = [0u8; 256]; // everything defaults to word content
    // 0x09..=0x0D: TAB, LF, VT, FF, CR — all word breaks.
    let mut b = 0x09;
    while b <= 0x0D {
        table[b] = 1;
        b += 1;
    }
    table[0x20] = 1; // ASCII space
    table
}

const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();

/// Byte classification for C/POSIX locale word counting: the UTF-8 table
/// plus 0xA0 as whitespace.
///
/// GNU wc C locale breaks words on 0x09-0x0D, 0x20, AND 0xA0.
/// Verified on GNU coreutils 9.7: `printf 'a\xa0b' | env LC_ALL=C wc -w` => 2.
/// Note: `echo -e '\xe4\xbd\xa0' | LC_ALL=C wc -w` = 1 is NOT a distinguishing
/// test (gives 1 regardless of 0xA0 treatment since nothing follows it).
/// 0xA0 is the final byte of '你' (U+4F60 = E4 BD A0), so it splits adjacent CJK.
const fn make_byte_class_c() -> [u8; 256] {
    let mut table = make_byte_class_utf8();
    table[0xA0] = 1;
    table
}

const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
62
// ──────────────────────────────────────────────────
// Unicode character classification helpers
// ──────────────────────────────────────────────────
66
/// Whether a Unicode codepoint is a whitespace character (matching glibc iswspace).
/// Only multi-byte Unicode spaces are listed here; ASCII whitespace is handled
/// by the single-byte table.
///
/// NOTE(review): glibc's iswspace also reports U+0085 (NEL) as whitespace in
/// UTF-8 locales — confirm against the target glibc before adding it here.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    // En Quad (U+2000) through Hair Space (U+200A).
    if (0x2000..=0x200A).contains(&cp) {
        return true;
    }
    matches!(
        cp,
        0x00A0 // No-Break Space
            | 0x1680 // Ogham Space Mark
            | 0x2028 // Line Separator
            | 0x2029 // Paragraph Separator
            | 0x202F // Narrow No-Break Space
            | 0x205F // Medium Mathematical Space
            | 0x3000 // Ideographic Space
    )
}
84
// NOTE(review): orphaned doc comment — it described an iswprint-style
// printability helper (C1 controls U+0080-U+009F non-printable) that is not
// present in this chunk; demoted from `///` so it no longer attaches to the
// next item (`count_lines`).
// ──────────────────────────────────────────────────
// Core counting functions
// ──────────────────────────────────────────────────
90
91/// Count newlines using SIMD-accelerated memchr.
92/// GNU wc counts newline bytes (`\n`), not logical lines.
93#[inline]
94pub fn count_lines(data: &[u8]) -> u64 {
95    memchr_iter(b'\n', data).count() as u64
96}
97
/// Byte count of the slice; trivial, but provided so every counter shares
/// the same `&[u8] -> u64` API shape.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    data.len() as u64
}
103
/// Count words using locale-aware 2-state logic (default: UTF-8).
///
/// Thin wrapper over `count_words_locale` with `utf8 = true`. (The "3-state"
/// wording previously here was stale — the whole module uses 2-state
/// whitespace/word-content classification.)
pub fn count_words(data: &[u8]) -> u64 {
    count_words_locale(data, true)
}
108
109/// Count words with explicit locale control using 2-state logic.
110///
111/// GNU wc classifies each byte/character as:
112///   - space (whitespace): sets in_word=false
113///   - word content (everything else): sets in_word=true, increments word count on transition
114pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
115    if utf8 {
116        count_words_utf8(data)
117    } else {
118        count_words_c(data)
119    }
120}
121
122/// Count words in C/POSIX locale using 2-state logic matching GNU wc.
123/// GNU wc treats bytes as either whitespace (word break) or word content.
124/// Whitespace: 0x09-0x0D, 0x20, 0xA0.
125/// Everything else (including NUL, control chars, high bytes except 0xA0) is word content.
126fn count_words_c(data: &[u8]) -> u64 {
127    let mut words = 0u64;
128    let mut in_word = false;
129    let mut i = 0;
130    let len = data.len();
131
132    while i < len {
133        let b = unsafe { *data.get_unchecked(i) };
134        let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
135        if class == 1 {
136            // Space — break word
137            in_word = false;
138        } else if !in_word {
139            // Word content (any non-space byte)
140            in_word = true;
141            words += 1;
142        }
143        i += 1;
144    }
145    words
146}
147
/// AVX2-accelerated fused line+word counter for C locale chunks.
/// Processes 32 bytes per iteration using 2-state logic:
///   - Space bytes (0x09-0x0D, 0x20, 0xA0): word breaks
///   - Everything else: word content (starts/continues words)
/// Word transitions detected via bitmask: word_content_mask & ~prev_word_content_mask.
///
/// Returns `(lines, words, first_byte_is_word_content, ends_in_word)`; the two
/// booleans let a parallel driver merge adjacent chunks so a word straddling a
/// chunk boundary is not counted twice.
///
/// # Safety
/// Caller must ensure the CPU supports AVX2 (e.g. via `is_x86_feature_detected!`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    let len = data.len();
    let ptr = data.as_ptr();
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    // SAFETY: every 32-byte unaligned load reads `ptr + i` with `i + 32 <= len`,
    // so it stays inside `data`; the scalar tail reads single bytes at `i < len`;
    // the table lookup indexes a [u8; 256] with a u8, always in bounds.
    unsafe {
        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
        let zero = _mm256_setzero_si256();
        let ones = _mm256_set1_epi8(1);
        // Space detection: 0x09-0x0D, 0x20, and 0xA0 (GNU wc C locale); NUL is word content
        let space_char = _mm256_set1_epi8(0x20i8);
        let tab_lo = _mm256_set1_epi8(0x08i8);
        let tab_hi = _mm256_set1_epi8(0x0Ei8);
        let nbsp_char = _mm256_set1_epi8(0xA0u8 as i8);

        let mut line_acc = _mm256_setzero_si256();
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
            // Accumulate per-lane newline hits as u8; flushed before they can
            // overflow (see `batch` handling below).
            line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));

            // is_space = (v == 0x20) | (v == 0xA0) | (v > 0x08 && v < 0x0E)
            // The range test uses *signed* byte compares: 0x09-0x0D are positive,
            // while bytes >= 0x80 are negative and correctly rejected; 0xA0 is
            // caught separately by its own equality test.
            let is_sp = _mm256_cmpeq_epi8(v, space_char);
            let is_nbsp = _mm256_cmpeq_epi8(v, nbsp_char);
            let gt_08 = _mm256_cmpgt_epi8(v, tab_lo);
            let lt_0e = _mm256_cmpgt_epi8(tab_hi, v);
            let is_tab_range = _mm256_and_si256(gt_08, lt_0e);
            let is_space = _mm256_or_si256(_mm256_or_si256(is_sp, is_nbsp), is_tab_range);

            let space_mask = _mm256_movemask_epi8(is_space) as u32;
            // Word content = NOT space
            let word_mask = !space_mask;

            // 2-state bitmask approach: count transitions from non-word to word.
            // Shifting `prev_in_word` into bit 0 carries state across the 32-byte
            // boundary so a word spanning two vectors is counted exactly once.
            let prev_mask = (word_mask << 1) | (prev_in_word as u32);
            total_words += (word_mask & !prev_mask).count_ones() as u64;
            prev_in_word = (word_mask >> 31) & 1 == 1;

            batch += 1;
            if batch >= 255 {
                // Each u8 lane gains at most 1 per iteration, so after 255
                // iterations a lane may hold 255 — flush now to avoid overflow.
                let sad = _mm256_sad_epu8(line_acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let s = _mm_add_epi64(lo, hi);
                let h64 = _mm_unpackhi_epi64(s, s);
                let t = _mm_add_epi64(s, h64);
                total_lines += _mm_cvtsi128_si64(t) as u64;
                line_acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Flush any remaining per-lane newline counts.
        if batch > 0 {
            let sad = _mm256_sad_epu8(line_acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let s = _mm_add_epi64(lo, hi);
            let h64 = _mm_unpackhi_epi64(s, s);
            let t = _mm_add_epi64(s, h64);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }

        // Scalar tail using 2-state logic
        while i < len {
            let b = *ptr.add(i);
            if b == b'\n' {
                total_lines += 1;
                prev_in_word = false;
            } else if *BYTE_CLASS_C.get_unchecked(b as usize) == 1 {
                // Other space byte
                prev_in_word = false;
            } else if !prev_in_word {
                // Word content
                total_words += 1;
                prev_in_word = true;
            }
            i += 1;
        }
    }

    let first_is_word = !data.is_empty() && BYTE_CLASS_C[data[0] as usize] != 1;
    (total_lines, total_words, first_is_word, prev_in_word)
}
246
/// SSE2-accelerated fused line+word counter for C locale chunks.
/// Same 2-state algorithm as AVX2 but processes 16 bytes per iteration.
/// Space bytes: 0x09-0x0D, 0x20, 0xA0 (NUL is word content). Available on all x86_64 CPUs.
///
/// Returns `(lines, words, first_byte_is_word_content, ends_in_word)` for
/// parallel chunk-boundary merging.
///
/// # Safety
/// SSE2 is part of the x86_64 baseline, so any x86_64 caller may invoke this.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    let len = data.len();
    let ptr = data.as_ptr();
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    // SAFETY: every 16-byte unaligned load reads `ptr + i` with `i + 16 <= len`;
    // the scalar tail reads single bytes at `i < len`; the table lookup indexes
    // a [u8; 256] with a u8, always in bounds.
    unsafe {
        let nl_byte = _mm_set1_epi8(b'\n' as i8);
        let zero = _mm_setzero_si128();
        let ones = _mm_set1_epi8(1);
        // Space detection: 0x09-0x0D, 0x20, and 0xA0 (GNU wc C locale); NUL is word content
        let space_char = _mm_set1_epi8(0x20i8);
        let tab_lo = _mm_set1_epi8(0x08i8);
        let tab_hi = _mm_set1_epi8(0x0Ei8);
        let nbsp_char = _mm_set1_epi8(0xA0u8 as i8);

        let mut line_acc = _mm_setzero_si128();
        let mut batch = 0u32;

        while i + 16 <= len {
            let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);
            let is_nl = _mm_cmpeq_epi8(v, nl_byte);
            // Per-lane u8 newline accumulator; flushed before overflow (below).
            line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));

            // is_space = (v == 0x20) | (v == 0xA0) | (v > 0x08 && v < 0x0E)
            // Signed compares: 0x09-0x0D are positive; bytes >= 0x80 are negative
            // and thus rejected by the range test; 0xA0 matched by equality.
            let is_sp = _mm_cmpeq_epi8(v, space_char);
            let is_nbsp = _mm_cmpeq_epi8(v, nbsp_char);
            let gt_08 = _mm_cmpgt_epi8(v, tab_lo);
            let lt_0e = _mm_cmpgt_epi8(tab_hi, v);
            let is_tab_range = _mm_and_si128(gt_08, lt_0e);
            let is_space = _mm_or_si128(_mm_or_si128(is_sp, is_nbsp), is_tab_range);

            let space_mask = _mm_movemask_epi8(is_space) as u32;
            // Word content = NOT space (only 16 bits relevant)
            let word_mask = (!space_mask) & 0xFFFF;

            // 2-state bitmask: count transitions from non-word to word
            // (bit 0 carries `prev_in_word` across the 16-byte boundary).
            let prev_mask = (word_mask << 1) | (prev_in_word as u32);
            total_words += (word_mask & !prev_mask).count_ones() as u64;
            prev_in_word = (word_mask >> 15) & 1 == 1;

            batch += 1;
            if batch >= 255 {
                // Each u8 lane gains at most 1 per iteration — flush at 255
                // before a lane can overflow.
                let sad = _mm_sad_epu8(line_acc, zero);
                let hi = _mm_unpackhi_epi64(sad, sad);
                let t = _mm_add_epi64(sad, hi);
                total_lines += _mm_cvtsi128_si64(t) as u64;
                line_acc = _mm_setzero_si128();
                batch = 0;
            }
            i += 16;
        }

        // Flush any remaining per-lane newline counts.
        if batch > 0 {
            let sad = _mm_sad_epu8(line_acc, zero);
            let hi = _mm_unpackhi_epi64(sad, sad);
            let t = _mm_add_epi64(sad, hi);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }

        // Scalar tail using 2-state logic
        while i < len {
            let b = *ptr.add(i);
            if b == b'\n' {
                total_lines += 1;
                prev_in_word = false;
            } else if *BYTE_CLASS_C.get_unchecked(b as usize) == 1 {
                prev_in_word = false;
            } else if !prev_in_word {
                total_words += 1;
                prev_in_word = true;
            }
            i += 1;
        }
    }

    let first_is_word = !data.is_empty() && BYTE_CLASS_C[data[0] as usize] != 1;
    (total_lines, total_words, first_is_word, prev_in_word)
}
335
336/// Dispatch to AVX2, SSE2, or scalar chunk counter.
337#[inline]
338fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
339    #[cfg(target_arch = "x86_64")]
340    {
341        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
342            return unsafe { count_lw_c_chunk_avx2(data) };
343        }
344        if data.len() >= 32 {
345            return unsafe { count_lw_c_chunk_sse2(data) };
346        }
347    }
348    count_lw_c_chunk(data)
349}
350
351/// Count words + lines in a C locale chunk using 2-state logic, returning
352/// counts plus boundary info for parallel chunk merging.
353/// Returns (line_count, word_count, first_is_word_content, ends_in_word).
354/// GNU wc: whitespace (0x09-0x0D, 0x20) breaks words; everything else is word content.
355fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
356    let mut lines = 0u64;
357    let mut words = 0u64;
358    let mut in_word = false;
359    let mut i = 0;
360    let len = data.len();
361
362    // Determine first byte's classification for boundary merging
363    let first_is_word = !data.is_empty() && BYTE_CLASS_C[data[0] as usize] != 1;
364
365    while i < len {
366        let b = unsafe { *data.get_unchecked(i) };
367        let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
368        if class == 1 {
369            // Space byte — break word
370            if b == b'\n' {
371                lines += 1;
372            }
373            in_word = false;
374        } else if !in_word {
375            // Word content (any non-space byte)
376            in_word = true;
377            words += 1;
378        }
379        i += 1;
380    }
381    (lines, words, first_is_word, in_word)
382}
383
384/// Count words in UTF-8 locale using 2-state logic matching GNU wc.
385///
386/// Handles:
387/// - ASCII spaces (0x09-0x0D, 0x20): word break
388/// - All other bytes: word content (including NUL, control chars, high bytes)
389/// - Valid UTF-8 multi-byte Unicode spaces: word break
390/// - Everything else: word content
391///
392/// Optimized with ASCII run skipping: when inside a word of printable ASCII,
393/// skips remaining non-space ASCII bytes without per-byte table lookups.
394fn count_words_utf8(data: &[u8]) -> u64 {
395    let mut words = 0u64;
396    let mut in_word = false;
397    let mut i = 0;
398    let len = data.len();
399
400    while i < len {
401        let b = unsafe { *data.get_unchecked(i) };
402
403        if b < 0x80 {
404            // ASCII byte — use table lookup
405            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
406            if class == 1 {
407                in_word = false;
408            } else if !in_word {
409                in_word = true;
410                words += 1;
411            }
412            i += 1;
413        } else if b < 0xC2 {
414            // Invalid UTF-8 start / continuation byte — word content
415            if !in_word {
416                in_word = true;
417                words += 1;
418            }
419            i += 1;
420        } else if b < 0xE0 {
421            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
422                let cp = ((b as u32 & 0x1F) << 6)
423                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
424                if is_unicode_space(cp) {
425                    in_word = false;
426                } else if !in_word {
427                    in_word = true;
428                    words += 1;
429                }
430                i += 2;
431            } else {
432                // Invalid sequence — word content
433                if !in_word {
434                    in_word = true;
435                    words += 1;
436                }
437                i += 1;
438            }
439        } else if b < 0xF0 {
440            if i + 2 < len
441                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
442                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
443            {
444                let cp = ((b as u32 & 0x0F) << 12)
445                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
446                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
447                if is_unicode_space(cp) {
448                    in_word = false;
449                } else if !in_word {
450                    in_word = true;
451                    words += 1;
452                }
453                i += 3;
454            } else {
455                if !in_word {
456                    in_word = true;
457                    words += 1;
458                }
459                i += 1;
460            }
461        } else if b < 0xF5 {
462            if i + 3 < len
463                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
464                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
465                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
466            {
467                let cp = ((b as u32 & 0x07) << 18)
468                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
469                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
470                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
471                if is_unicode_space(cp) {
472                    in_word = false;
473                } else if !in_word {
474                    in_word = true;
475                    words += 1;
476                }
477                i += 4;
478            } else {
479                if !in_word {
480                    in_word = true;
481                    words += 1;
482                }
483                i += 1;
484            }
485        } else {
486            // Invalid byte >= 0xF5 — word content
487            if !in_word {
488                in_word = true;
489                words += 1;
490            }
491            i += 1;
492        }
493    }
494
495    words
496}
497
498/// Count lines and words using optimized strategies per locale.
499/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
500/// C locale: AVX2 SIMD fused counter when available, scalar fallback otherwise.
501pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
502    if utf8 {
503        count_lines_words_utf8_fused(data)
504    } else {
505        let (lines, words, _, _) = count_lw_c_chunk_fast(data);
506        (lines, words)
507    }
508}
509
510/// Fused lines+words counting in UTF-8 mode (single pass).
511/// Avoids separate memchr pass for newlines by counting them inline with words.
512/// Uses 2-state logic: whitespace breaks words, everything else is word content.
513fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
514    let mut lines = 0u64;
515    let mut words = 0u64;
516    let mut in_word = false;
517    let mut i = 0;
518    let len = data.len();
519
520    while i < len {
521        let b = unsafe { *data.get_unchecked(i) };
522
523        if b == b'\n' {
524            lines += 1;
525            in_word = false;
526            i += 1;
527        } else if b < 0x80 {
528            // ASCII byte — use table lookup
529            let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
530            if class == 1 {
531                in_word = false;
532            } else if !in_word {
533                in_word = true;
534                words += 1;
535            }
536            i += 1;
537        } else if b < 0xC2 {
538            // Invalid UTF-8 start / continuation byte — word content
539            if !in_word {
540                in_word = true;
541                words += 1;
542            }
543            i += 1;
544        } else if b < 0xE0 {
545            if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
546                let cp = ((b as u32 & 0x1F) << 6)
547                    | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
548                if is_unicode_space(cp) {
549                    in_word = false;
550                } else if !in_word {
551                    in_word = true;
552                    words += 1;
553                }
554                i += 2;
555            } else {
556                if !in_word {
557                    in_word = true;
558                    words += 1;
559                }
560                i += 1;
561            }
562        } else if b < 0xF0 {
563            if i + 2 < len
564                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
565                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
566            {
567                let cp = ((b as u32 & 0x0F) << 12)
568                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
569                    | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
570                if is_unicode_space(cp) {
571                    in_word = false;
572                } else if !in_word {
573                    in_word = true;
574                    words += 1;
575                }
576                i += 3;
577            } else {
578                if !in_word {
579                    in_word = true;
580                    words += 1;
581                }
582                i += 1;
583            }
584        } else if b < 0xF5 {
585            if i + 3 < len
586                && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
587                && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
588                && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
589            {
590                let cp = ((b as u32 & 0x07) << 18)
591                    | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
592                    | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
593                    | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
594                if is_unicode_space(cp) {
595                    in_word = false;
596                } else if !in_word {
597                    in_word = true;
598                    words += 1;
599                }
600                i += 4;
601            } else {
602                if !in_word {
603                    in_word = true;
604                    words += 1;
605                }
606                i += 1;
607            }
608        } else {
609            // Invalid byte >= 0xF5 — word content
610            if !in_word {
611                in_word = true;
612                words += 1;
613            }
614            i += 1;
615        }
616    }
617
618    (lines, words)
619}
620
621/// Count lines, words, and chars using optimized strategies per locale.
622pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
623    if utf8 {
624        // Fused single-pass for lines+words, then fast char-counting pass
625        let (lines, words) = count_lines_words_utf8_fused(data);
626        let chars = count_chars_utf8(data);
627        (lines, words, chars)
628    } else {
629        // C locale: use optimized fused lines+words, chars = byte count
630        let (lines, words) = count_lines_words(data, false);
631        (lines, words, data.len() as u64)
632    }
633}
634
635/// Count UTF-8 characters by counting non-continuation bytes.
636/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
637/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
638///
639/// Uses AVX2 SIMD on x86_64 for ~32 bytes per cycle throughput.
640/// Falls back to 64-byte block processing with popcount on other architectures.
641pub fn count_chars_utf8(data: &[u8]) -> u64 {
642    #[cfg(target_arch = "x86_64")]
643    {
644        if is_x86_feature_detected!("avx2") {
645            return unsafe { count_chars_utf8_avx2(data) };
646        }
647    }
648    count_chars_utf8_scalar(data)
649}
650
/// AVX2 SIMD character counter: counts non-continuation bytes using
/// vectorized AND+CMP with batched horizontal reduction via PSADBW.
/// Processes 32 bytes per ~3 instructions, with horizontal sum every 255 iterations.
///
/// # Safety
/// Caller must ensure the CPU supports AVX2 (e.g. via `is_x86_feature_detected!`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    // SAFETY: every 32-byte unaligned load reads `ptr + i` with `i + 32 <= len`,
    // so it stays inside `data`; the scalar tail reads single bytes at `i < len`.
    unsafe {
        use std::arch::x86_64::*;

        let mask_c0 = _mm256_set1_epi8(0xC0u8 as i8);
        let val_80 = _mm256_set1_epi8(0x80u8 as i8);
        let ones = _mm256_set1_epi8(1);
        let zero = _mm256_setzero_si256();

        let mut total = 0u64;
        let len = data.len();
        let ptr = data.as_ptr();
        let mut i = 0;
        let mut acc = _mm256_setzero_si256();
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            // Continuation byte <=> (b & 0xC0) == 0x80.
            let masked = _mm256_and_si256(v, mask_c0);
            let is_cont = _mm256_cmpeq_epi8(masked, val_80);
            // andnot(is_cont, ones): lane = 1 for every character-starting byte.
            let non_cont = _mm256_andnot_si256(is_cont, ones);
            acc = _mm256_add_epi8(acc, non_cont);

            batch += 1;
            if batch >= 255 {
                // Each u8 lane gains at most 1 per iteration — flush at 255
                // before a lane can overflow.
                // Horizontal sum via PSADBW: sum u8 differences against zero
                let sad = _mm256_sad_epu8(acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let sum = _mm_add_epi64(lo, hi);
                let hi64 = _mm_unpackhi_epi64(sum, sum);
                let t = _mm_add_epi64(sum, hi64);
                total += _mm_cvtsi128_si64(t) as u64;
                acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Final horizontal sum
        if batch > 0 {
            let sad = _mm256_sad_epu8(acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let sum = _mm_add_epi64(lo, hi);
            let hi64 = _mm_unpackhi_epi64(sum, sum);
            let t = _mm_add_epi64(sum, hi64);
            total += _mm_cvtsi128_si64(t) as u64;
        }

        // Scalar tail: count bytes that are not continuations.
        while i < len {
            total += ((*ptr.add(i) & 0xC0) != 0x80) as u64;
            i += 1;
        }

        total
    }
}
714
/// Scalar fallback for `count_chars_utf8`: counts bytes that are NOT UTF-8
/// continuation bytes (`10xxxxxx`), i.e. the bytes that start a character.
///
/// The previous version hand-unrolled two 8-wide loops over 64-byte chunks
/// with `unsafe { get_unchecked }` and no SAFETY justification. The simple
/// branch-free iterator below is equivalent (identical results for every
/// input, including invalid UTF-8) and safe; LLVM auto-vectorizes this
/// filter/count pattern, so the manual unrolling bought nothing verifiable.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    data.iter().filter(|&&b| (b & 0xC0) != 0x80).count() as u64
}
766
/// Character count in the C/POSIX locale, where one byte is one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    data.len() as u64
}
772
773/// Count characters, choosing behavior based on locale.
774#[inline]
775pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
776    if utf8 {
777        count_chars_utf8(data)
778    } else {
779        count_chars_c(data)
780    }
781}
782
/// Detect whether the active locale uses UTF-8 encoding.
///
/// Honors POSIX precedence — LC_ALL overrides LC_CTYPE, which overrides LANG.
/// The first variable that is set and non-empty decides; a value containing
/// "utf-8" or "utf8" (case-insensitive) means UTF-8. Unset, empty, or
/// non-Unicode values fall through to the next variable; if none match,
/// defaults to false (C locale).
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .find_map(|name| std::env::var(name).ok().filter(|v| !v.is_empty()))
        .map_or(false, |value| {
            let value = value.to_ascii_lowercase();
            value.contains("utf-8") || value.contains("utf8")
        })
}
795
/// Decode a single UTF-8 character from the front of `bytes`.
///
/// Returns `(codepoint, byte_length)`. Any invalid or truncated sequence
/// falls back to `(bytes[0] as u32, 1)` — one byte of recovery at a time.
/// Callers must pass a non-empty slice.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let b0 = bytes[0];
    // Leader ranges: 0xC2-0xDF two-byte, 0xE0-0xEF three-byte, 0xF0-0xF4
    // four-byte. ASCII, bare continuations (0x80-0xBF), overlong starts
    // (0xC0/0xC1), and 0xF5-0xFF all take the single-byte fallback.
    match b0 {
        0xC2..=0xDF => {
            if let Some(&b1) = bytes.get(1) {
                if b1 & 0xC0 == 0x80 {
                    let cp = ((u32::from(b0) & 0x1F) << 6) | (u32::from(b1) & 0x3F);
                    return (cp, 2);
                }
            }
            (u32::from(b0), 1)
        }
        0xE0..=0xEF => {
            if let (Some(&b1), Some(&b2)) = (bytes.get(1), bytes.get(2)) {
                if b1 & 0xC0 == 0x80 && b2 & 0xC0 == 0x80 {
                    let cp = ((u32::from(b0) & 0x0F) << 12)
                        | ((u32::from(b1) & 0x3F) << 6)
                        | (u32::from(b2) & 0x3F);
                    return (cp, 3);
                }
            }
            (u32::from(b0), 1)
        }
        0xF0..=0xF4 => {
            if let (Some(&b1), Some(&b2), Some(&b3)) = (bytes.get(1), bytes.get(2), bytes.get(3)) {
                if b1 & 0xC0 == 0x80 && b2 & 0xC0 == 0x80 && b3 & 0xC0 == 0x80 {
                    let cp = ((u32::from(b0) & 0x07) << 18)
                        | ((u32::from(b1) & 0x3F) << 12)
                        | ((u32::from(b2) & 0x3F) << 6)
                        | (u32::from(b3) & 0x3F);
                    return (cp, 4);
                }
            }
            (u32::from(b0), 1)
        }
        _ => (u32::from(b0), 1),
    }
}
839
/// Check if a Unicode codepoint is a zero-width character (combining mark, etc.).
/// GNU wc uses wcwidth() which returns 0 for these. We must match.
///
/// The table lists non-spacing combining marks, invisible format/control
/// characters, and conjoining Hangul jamo, grouped roughly by Unicode block.
/// NOTE(review): this is a fixed snapshot of the wcwidth() zero-width set —
/// confirm it matches the glibc/Unicode version being targeted.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    matches!(
        cp,
        0x0300..=0x036F   // Combining Diacritical Marks
        | 0x0483..=0x0489 // Cyrillic combining marks
        | 0x0591..=0x05BD // Hebrew combining marks
        | 0x05BF          // more Hebrew points
        | 0x05C1..=0x05C2
        | 0x05C4..=0x05C5
        | 0x05C7
        | 0x0600..=0x0605 // Arabic number signs
        | 0x0610..=0x061A // Arabic combining marks
        | 0x064B..=0x065F // Arabic combining marks
        | 0x0670          // Arabic superscript alef
        | 0x06D6..=0x06DD // more Arabic marks
        | 0x06DF..=0x06E4
        | 0x06E7..=0x06E8
        | 0x06EA..=0x06ED
        | 0x070F          // Syriac abbreviation mark
        | 0x0711          // Syriac combining marks
        | 0x0730..=0x074A
        | 0x07A6..=0x07B0 // Thaana vowel signs
        | 0x07EB..=0x07F3 // NKo combining marks
        | 0x07FD
        | 0x0816..=0x0819 // Samaritan marks
        | 0x081B..=0x0823
        | 0x0825..=0x0827
        | 0x0829..=0x082D
        | 0x0859..=0x085B // Mandaic marks
        | 0x08D3..=0x08E1 // Arabic Extended-A marks
        | 0x08E3..=0x0902
        | 0x093A          // Devanagari non-spacing vowel signs / virama
        | 0x093C
        | 0x0941..=0x0948
        | 0x094D
        | 0x0951..=0x0957
        | 0x0962..=0x0963
        | 0x0981          // Bengali
        | 0x09BC
        | 0x09C1..=0x09C4
        | 0x09CD
        | 0x09E2..=0x09E3
        | 0x09FE
        | 0x0A01..=0x0A02 // Gurmukhi
        | 0x0A3C
        | 0x0A41..=0x0A42
        | 0x0A47..=0x0A48
        | 0x0A4B..=0x0A4D
        | 0x0A51
        | 0x0A70..=0x0A71
        | 0x0A75
        | 0x0A81..=0x0A82 // Gujarati
        | 0x0ABC
        | 0x0AC1..=0x0AC5
        | 0x0AC7..=0x0AC8
        | 0x0ACD
        | 0x0AE2..=0x0AE3
        | 0x0AFA..=0x0AFF
        | 0x0B01          // Oriya
        | 0x0B3C
        | 0x0B3F
        | 0x0B41..=0x0B44
        | 0x0B4D
        | 0x0B56
        | 0x0B62..=0x0B63
        | 0x0B82          // Tamil
        | 0x0BC0
        | 0x0BCD
        | 0x0C00          // Telugu
        | 0x0C04
        | 0x0C3E..=0x0C40
        | 0x0C46..=0x0C48
        | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56
        | 0x0C62..=0x0C63
        | 0x0C81          // Kannada
        | 0x0CBC
        | 0x0CBF
        | 0x0CC6
        | 0x0CCC..=0x0CCD
        | 0x0CE2..=0x0CE3
        | 0x0D00..=0x0D01 // Malayalam
        | 0x0D3B..=0x0D3C
        | 0x0D41..=0x0D44
        | 0x0D4D
        | 0x0D62..=0x0D63
        | 0x0DCA          // Sinhala
        | 0x0DD2..=0x0DD4
        | 0x0DD6
        | 0x0E31          // Thai
        | 0x0E34..=0x0E3A
        | 0x0E47..=0x0E4E
        | 0x0EB1          // Lao
        | 0x0EB4..=0x0EBC
        | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19 // Tibetan
        | 0x0F35
        | 0x0F37
        | 0x0F39
        | 0x0F71..=0x0F7E
        | 0x0F80..=0x0F84
        | 0x0F86..=0x0F87
        | 0x0F8D..=0x0F97
        | 0x0F99..=0x0FBC
        | 0x0FC6
        | 0x102D..=0x1030 // Myanmar
        | 0x1032..=0x1037
        | 0x1039..=0x103A
        | 0x103D..=0x103E
        | 0x1058..=0x1059
        | 0x105E..=0x1060
        | 0x1071..=0x1074
        | 0x1082
        | 0x1085..=0x1086
        | 0x108D
        | 0x109D
        | 0x1160..=0x11FF // Hangul Jamo medial vowels and final consonants
        | 0x135D..=0x135F // Ethiopic combining marks
        | 0x1712..=0x1714 // Philippine scripts (Tagalog, Hanunoo, Buhid, Tagbanwa)
        | 0x1732..=0x1734
        | 0x1752..=0x1753
        | 0x1772..=0x1773
        | 0x17B4..=0x17B5 // Khmer
        | 0x17B7..=0x17BD
        | 0x17C6
        | 0x17C9..=0x17D3
        | 0x17DD
        | 0x180B..=0x180D // Mongolian free variation selectors
        | 0x1885..=0x1886 // Mongolian
        | 0x18A9
        | 0x1920..=0x1922 // Limbu
        | 0x1927..=0x1928
        | 0x1932
        | 0x1939..=0x193B
        | 0x1A17..=0x1A18 // Buginese
        | 0x1A1B
        | 0x1A56          // Tai Tham
        | 0x1A58..=0x1A5E
        | 0x1A60
        | 0x1A62
        | 0x1A65..=0x1A6C
        | 0x1A73..=0x1A7C
        | 0x1A7F
        | 0x1AB0..=0x1ABE // Combining Diacritical Marks Extended
        | 0x1B00..=0x1B03 // Balinese
        | 0x1B34
        | 0x1B36..=0x1B3A
        | 0x1B3C
        | 0x1B42
        | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81 // Sundanese
        | 0x1BA2..=0x1BA5
        | 0x1BA8..=0x1BA9
        | 0x1BAB..=0x1BAD
        | 0x1BE6          // Batak
        | 0x1BE8..=0x1BE9
        | 0x1BED
        | 0x1BEF..=0x1BF1
        | 0x1C2C..=0x1C33 // Lepcha
        | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2 // Vedic Extensions
        | 0x1CD4..=0x1CE0
        | 0x1CE2..=0x1CE8
        | 0x1CED
        | 0x1CF4
        | 0x1CF8..=0x1CF9
        | 0x1DC0..=0x1DF9 // Combining Diacritical Marks Supplement
        | 0x1DFB..=0x1DFF
        | 0x200B..=0x200F // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        | 0x202A..=0x202E // Bidi control chars
        | 0x2060..=0x2064 // Word joiner, invisible operators
        | 0x2066..=0x206F // Bidi isolates
        | 0x20D0..=0x20F0 // Combining marks for symbols
        | 0xFE00..=0xFE0F // Variation Selectors
        | 0xFE20..=0xFE2F // Combining Half Marks
        | 0xFEFF          // Zero Width No-Break Space (BOM)
        | 0xFFF9..=0xFFFB // Interlinear annotation anchors
        | 0x1D167..=0x1D169 // Musical symbol combining marks
        | 0x1D173..=0x1D182
        | 0x1D185..=0x1D18B
        | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001           // Tags block (language tag characters)
        | 0xE0020..=0xE007F
        | 0xE0100..=0xE01EF // Variation Selectors Supplement
    )
}
1030
/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
/// Matches glibc wcwidth() behavior for maximum GNU compatibility.
///
/// Covers CJK scripts, Hangul, fullwidth forms, and the emoji/symbol
/// codepoints that render double-width.
/// NOTE(review): fixed snapshot of a wide/fullwidth table — confirm it
/// matches the glibc/Unicode version being targeted.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F   // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615 // Emoji-default symbols rendered double-width
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702          // Dingbats with emoji presentation
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3040..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3  // Hangul Syllables
        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19  // Vertical Forms
        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
        | 0xFF01..=0xFF60  // Fullwidth forms (ASCII variants, fullwidth white parens)
        | 0xFFE0..=0xFFE6  // Fullwidth Signs
        | 0x1F004           // Mahjong tile red dragon
        | 0x1F0CF           // Playing card black joker
        | 0x1F170..=0x1F171 // Enclosed Alphanumeric Supplement
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202 // Squared katakana
        | 0x1F210..=0x1F23B // Squared CJK ideographs
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F // Chess Symbols
        | 0x1FA70..=0x1FAFF // Symbols and Pictographs Extended-A
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
1117
/// Maximum display width of any line in `data` for the C/POSIX locale.
///
/// Width rules (mirroring GNU `wc -L` in the C locale):
/// - `\n` and `\f` terminate a line: record its width, reset tracking
/// - `\t` advances the column to the next multiple of 8
/// - `\r` rewinds the column to 0 without ending the line
/// - space and printable ASCII (0x21..=0x7E) each occupy one column
/// - every other byte (controls, bytes >= 0x80) occupies zero columns
///
/// Runs of printable non-space ASCII are measured in a single scan so the
/// common case advances without per-byte width bookkeeping.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut widest: u64 = 0; // best completed-line width so far
    let mut cur_width: u64 = 0; // high-water mark of the current line
    let mut col: u64 = 0; // current column (can rewind via \r)
    let mut idx = 0;
    let total = data.len();

    while idx < total {
        match data[idx] {
            0x21..=0x7E => {
                // Measure the whole printable run at once.
                let run_start = idx;
                idx += 1;
                while idx < total && matches!(data[idx], 0x21..=0x7E) {
                    idx += 1;
                }
                col += (idx - run_start) as u64;
                cur_width = cur_width.max(col);
            }
            b' ' => {
                col += 1;
                cur_width = cur_width.max(col);
                idx += 1;
            }
            b'\n' | 0x0C => {
                // Line terminator (form feed acts like newline here).
                widest = widest.max(cur_width);
                col = 0;
                cur_width = 0;
                idx += 1;
            }
            b'\t' => {
                col = (col + 8) & !7;
                cur_width = cur_width.max(col);
                idx += 1;
            }
            b'\r' => {
                col = 0;
                idx += 1;
            }
            _ => idx += 1, // non-printable: width 0
        }
    }

    // The final line may lack a terminator.
    widest.max(cur_width)
}
1199
1200/// Compute maximum display width of any line (UTF-8 locale).
1201///
1202/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
1203/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
1204///
1205/// Optimized with printable ASCII run counting for common text.
1206pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1207    let mut max_len: u64 = 0;
1208    let mut line_len: u64 = 0;
1209    let mut linepos: u64 = 0;
1210    let mut i = 0;
1211    let len = data.len();
1212
1213    while i < len {
1214        let b = unsafe { *data.get_unchecked(i) };
1215
1216        if b >= 0x21 && b <= 0x7E {
1217            // Printable non-space ASCII (most common) — count run length
1218            i += 1;
1219            let mut run = 1u64;
1220            while i < len {
1221                let b = unsafe { *data.get_unchecked(i) };
1222                if b >= 0x21 && b <= 0x7E {
1223                    run += 1;
1224                    i += 1;
1225                } else {
1226                    break;
1227                }
1228            }
1229            linepos += run;
1230            if linepos > line_len {
1231                line_len = linepos;
1232            }
1233        } else if b < 0x80 {
1234            // Other ASCII: space, tab, newline, controls
1235            match b {
1236                b' ' => {
1237                    linepos += 1;
1238                    if linepos > line_len {
1239                        line_len = linepos;
1240                    }
1241                }
1242                b'\n' => {
1243                    if line_len > max_len {
1244                        max_len = line_len;
1245                    }
1246                    linepos = 0;
1247                    line_len = 0;
1248                }
1249                b'\t' => {
1250                    linepos = (linepos + 8) & !7;
1251                    if linepos > line_len {
1252                        line_len = linepos;
1253                    }
1254                }
1255                b'\r' => {
1256                    linepos = 0;
1257                }
1258                0x0C => {
1259                    if line_len > max_len {
1260                        max_len = line_len;
1261                    }
1262                    linepos = 0;
1263                    line_len = 0;
1264                }
1265                _ => {} // Non-printable: width 0
1266            }
1267            i += 1;
1268        } else {
1269            // Multibyte UTF-8
1270            let (cp, len) = decode_utf8(&data[i..]);
1271
1272            // C1 control characters (0x80..0x9F): non-printable, width 0
1273            if cp <= 0x9F {
1274                // width 0
1275            } else if is_zero_width(cp) {
1276                // Combining marks, zero-width chars: width 0
1277            } else if is_wide_char(cp) {
1278                linepos += 2;
1279                if linepos > line_len {
1280                    line_len = linepos;
1281                }
1282            } else {
1283                // Regular printable Unicode character: width 1
1284                linepos += 1;
1285                if linepos > line_len {
1286                    line_len = linepos;
1287                }
1288            }
1289            i += len;
1290        }
1291    }
1292
1293    // Handle last line
1294    if line_len > max_len {
1295        max_len = line_len;
1296    }
1297
1298    max_len
1299}
1300
1301/// Compute maximum display width, choosing behavior based on locale.
1302#[inline]
1303pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1304    if utf8 {
1305        max_line_length_utf8(data)
1306    } else {
1307        max_line_length_c(data)
1308    }
1309}
1310
1311/// Count all metrics using optimized individual passes.
1312///
1313/// Each metric uses its own optimized algorithm:
1314/// - Lines: SIMD-accelerated memchr
1315/// - Words: 3-state scalar/state-machine (locale-dependent)
1316/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1317/// - Max line length: locale-aware display width tracking
1318///
1319/// Multi-pass is faster than single-pass because each pass has a tight,
1320/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1321/// making subsequent passes nearly free for memory bandwidth.
1322pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1323    if utf8 {
1324        let (lines, words) = count_lines_words_utf8_fused(data);
1325        WcCounts {
1326            lines,
1327            words,
1328            bytes: data.len() as u64,
1329            chars: count_chars_utf8(data),
1330            max_line_length: max_line_length_utf8(data),
1331        }
1332    } else {
1333        WcCounts {
1334            lines: count_lines(data),
1335            words: count_words_locale(data, false),
1336            bytes: data.len() as u64,
1337            chars: data.len() as u64,
1338            max_line_length: max_line_length_c(data),
1339        }
1340    }
1341}
1342
/// Heuristic ASCII probe: samples up to 256 bytes from the start, the
/// middle, and the end of `data`, reporting whether every sampled byte is
/// below 0x80. A `true` result means "likely all-ASCII" — bytes outside
/// the three sampled windows are never inspected.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    // OR all bytes together; the high bit survives iff any byte >= 0x80.
    // 8-byte blocks keep the reduction branch-free and fast.
    fn window_is_ascii(window: &[u8]) -> bool {
        let mut folded = 0u8;
        let mut blocks = window.chunks_exact(8);
        for block in &mut blocks {
            for &b in block {
                folded |= b;
            }
        }
        for &b in blocks.remainder() {
            folded |= b;
        }
        folded & 0x80 == 0
    }

    let len = data.len();
    if len == 0 {
        return true;
    }
    let sample = len.min(256);

    // Beginning.
    if !window_is_ascii(&data[..sample]) {
        return false;
    }
    // Middle — only when it doesn't overlap the other two windows.
    if len > sample * 2 {
        let mid_start = (len / 2).saturating_sub(sample / 2);
        let mid_end = (mid_start + sample).min(len);
        if !window_is_ascii(&data[mid_start..mid_end]) {
            return false;
        }
    }
    // End.
    if len > sample && !window_is_ascii(&data[len - sample..]) {
        return false;
    }

    true
}
1401
1402// ──────────────────────────────────────────────────
1403// Parallel counting for large files
1404// ──────────────────────────────────────────────────
1405
1406/// Split data into chunks at newline boundaries for parallel processing.
1407/// Returns slices where each slice (except possibly the last) ends with `\n`.
1408/// Splitting at newlines guarantees word boundaries in any locale,
1409/// enabling safe parallel word counting without boundary adjustment.
1410fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1411    if data.is_empty() || num_chunks <= 1 {
1412        return vec![data];
1413    }
1414    let chunk_size = data.len() / num_chunks;
1415    let mut chunks = Vec::with_capacity(num_chunks);
1416    let mut pos = 0;
1417
1418    for _ in 0..num_chunks - 1 {
1419        let target = pos + chunk_size;
1420        if target >= data.len() {
1421            break;
1422        }
1423        let boundary = memchr::memchr(b'\n', &data[target..])
1424            .map(|p| target + p + 1)
1425            .unwrap_or(data.len());
1426        if boundary > pos {
1427            chunks.push(&data[pos..boundary]);
1428        }
1429        pos = boundary;
1430    }
1431    if pos < data.len() {
1432        chunks.push(&data[pos..]);
1433    }
1434    chunks
1435}
1436
1437/// Count newlines in parallel using SIMD memchr + rayon.
1438/// Each thread gets at least 1MB (to amortize rayon scheduling overhead).
1439pub fn count_lines_parallel(data: &[u8]) -> u64 {
1440    if data.len() < PARALLEL_THRESHOLD {
1441        return count_lines(data);
1442    }
1443
1444    let num_threads = rayon::current_num_threads().max(1);
1445    // Ensure chunks are large enough to amortize SIMD setup overhead
1446    let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1447
1448    data.par_chunks(chunk_size)
1449        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1450        .sum()
1451}
1452
1453/// Count words in parallel with boundary adjustment.
1454pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1455    if data.len() < PARALLEL_THRESHOLD {
1456        return count_words_locale(data, utf8);
1457    }
1458
1459    let num_threads = rayon::current_num_threads().max(1);
1460
1461    if utf8 {
1462        // UTF-8: split at newline boundaries for safe parallel word counting.
1463        // Newlines are always word boundaries, so no boundary adjustment needed.
1464        let chunks = split_at_newlines(data, num_threads);
1465        chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1466    } else {
1467        // C locale: parallel 3-state word counting with boundary adjustment
1468        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1469
1470        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1471
1472        // Each chunk returns (lines, word_count, first_active_is_printable, ends_in_word)
1473        let results: Vec<(u64, u64, bool, bool)> = chunks
1474            .par_iter()
1475            .map(|chunk| count_lw_c_chunk(chunk))
1476            .collect();
1477
1478        let mut total = 0u64;
1479        for i in 0..results.len() {
1480            total += results[i].1;
1481            // Boundary adjustment: if previous chunk ended in_word AND
1482            // current chunk's first non-transparent byte is printable,
1483            // the word was split across chunks — subtract the overcount.
1484            if i > 0 && results[i - 1].3 && results[i].2 {
1485                total -= 1;
1486            }
1487        }
1488        total
1489    }
1490}
1491
1492/// Count UTF-8 characters in parallel.
1493pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1494    if !utf8 {
1495        return data.len() as u64;
1496    }
1497    if data.len() < PARALLEL_THRESHOLD {
1498        return count_chars_utf8(data);
1499    }
1500
1501    let num_threads = rayon::current_num_threads().max(1);
1502    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1503
1504    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1505}
1506
1507/// Count lines + words + bytes in a single fused pass (the default wc mode).
1508/// Avoids separate passes entirely — combines newline counting with word detection.
1509pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1510    let (lines, words) = count_lines_words(data, utf8);
1511    (lines, words, data.len() as u64)
1512}
1513
1514/// Parallel counting of lines + words + bytes only (no chars).
1515/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1516/// C locale: single fused pass per chunk counts BOTH lines and words.
1517/// UTF-8: checks ASCII first for C locale fast path, else splits at newlines
1518/// for safe parallel UTF-8 word counting.
1519pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1520    if data.len() < PARALLEL_THRESHOLD {
1521        // Small file: use fused single-pass
1522        return count_lwb(data, utf8);
1523    }
1524
1525    let num_threads = rayon::current_num_threads().max(1);
1526
1527    let (lines, words) = if !utf8 {
1528        // C locale: FUSED parallel lines+words counting — single pass per chunk
1529        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1530
1531        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1532        let results: Vec<(u64, u64, bool, bool)> = chunks
1533            .par_iter()
1534            .map(|chunk| count_lw_c_chunk_fast(chunk))
1535            .collect();
1536
1537        let mut line_total = 0u64;
1538        let mut word_total = 0u64;
1539        for i in 0..results.len() {
1540            line_total += results[i].0;
1541            word_total += results[i].1;
1542            if i > 0 && results[i - 1].3 && results[i].2 {
1543                word_total -= 1;
1544            }
1545        }
1546
1547        (line_total, word_total)
1548    } else {
1549        // UTF-8 locale: check if ASCII for faster C locale path
1550        let is_ascii = check_ascii_sample(data);
1551        if is_ascii {
1552            // Pure ASCII: use C locale parallel path (arbitrary chunks OK)
1553            let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1554            let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1555            let results: Vec<(u64, u64, bool, bool)> = chunks
1556                .par_iter()
1557                .map(|chunk| count_lw_c_chunk_fast(chunk))
1558                .collect();
1559
1560            let mut line_total = 0u64;
1561            let mut word_total = 0u64;
1562            for i in 0..results.len() {
1563                line_total += results[i].0;
1564                word_total += results[i].1;
1565                if i > 0 && results[i - 1].3 && results[i].2 {
1566                    word_total -= 1;
1567                }
1568            }
1569            (line_total, word_total)
1570        } else {
1571            // Non-ASCII UTF-8: split at newline boundaries for safe parallel
1572            // word counting. Newlines always break words, so no adjustment needed.
1573            let chunks = split_at_newlines(data, num_threads);
1574            let results: Vec<(u64, u64)> = chunks
1575                .par_iter()
1576                .map(|chunk| count_lines_words_utf8_fused(chunk))
1577                .collect();
1578            let mut line_total = 0u64;
1579            let mut word_total = 0u64;
1580            for (l, w) in results {
1581                line_total += l;
1582                word_total += w;
1583            }
1584            (line_total, word_total)
1585        }
1586    };
1587
1588    (lines, words, data.len() as u64)
1589}
1590
1591/// Combined parallel counting of lines + words + chars.
1592/// UTF-8: splits at newline boundaries for fused lines+words+chars per chunk.
1593/// C locale: fused parallel lines+words with boundary adjustment + parallel chars.
1594pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1595    if data.len() < PARALLEL_THRESHOLD {
1596        let lines = count_lines(data);
1597        let words = count_words_locale(data, utf8);
1598        let chars = count_chars(data, utf8);
1599        return (lines, words, chars);
1600    }
1601
1602    let num_threads = rayon::current_num_threads().max(1);
1603
1604    if utf8 {
1605        // UTF-8: fused parallel lines+words+chars per chunk (split at newlines)
1606        let chunks = split_at_newlines(data, num_threads);
1607        let results: Vec<(u64, u64, u64)> = chunks
1608            .par_iter()
1609            .map(|chunk| {
1610                let (lines, words) = count_lines_words_utf8_fused(chunk);
1611                let chars = count_chars_utf8(chunk);
1612                (lines, words, chars)
1613            })
1614            .collect();
1615        let mut lines = 0u64;
1616        let mut words = 0u64;
1617        let mut chars = 0u64;
1618        for (l, w, c) in results {
1619            lines += l;
1620            words += w;
1621            chars += c;
1622        }
1623        (lines, words, chars)
1624    } else {
1625        // C locale: fused parallel lines+words + parallel chars (= byte count)
1626        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1627        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1628        let results: Vec<(u64, u64, bool, bool)> = chunks
1629            .par_iter()
1630            .map(|chunk| count_lw_c_chunk_fast(chunk))
1631            .collect();
1632        let mut lines = 0u64;
1633        let mut words = 0u64;
1634        for i in 0..results.len() {
1635            lines += results[i].0;
1636            words += results[i].1;
1637            if i > 0 && results[i - 1].3 && results[i].2 {
1638                words -= 1;
1639            }
1640        }
1641        (lines, words, data.len() as u64)
1642    }
1643}
1644
1645/// Parallel max line length computation.
1646/// Splits at newline boundaries so each chunk independently computes correct
1647/// max line width (since newlines reset position tracking).
1648pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1649    if data.len() < PARALLEL_THRESHOLD {
1650        return max_line_length(data, utf8);
1651    }
1652    let num_threads = rayon::current_num_threads().max(1);
1653    let chunks = split_at_newlines(data, num_threads);
1654    chunks
1655        .par_iter()
1656        .map(|chunk| {
1657            if utf8 {
1658                max_line_length_utf8(chunk)
1659            } else {
1660                max_line_length_c(chunk)
1661            }
1662        })
1663        .max()
1664        .unwrap_or(0)
1665}
1666
1667/// Parallel counting of all metrics at once.
1668/// Splits at newline boundaries for safe parallel word + max_line_length counting.
1669/// Each chunk computes all metrics in a single traversal group, maximizing cache reuse.
1670pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1671    if data.len() < PARALLEL_THRESHOLD {
1672        return count_all(data, utf8);
1673    }
1674
1675    let num_threads = rayon::current_num_threads().max(1);
1676    let chunks = split_at_newlines(data, num_threads);
1677
1678    if utf8 {
1679        let results: Vec<(u64, u64, u64, u64)> = chunks
1680            .par_iter()
1681            .map(|chunk| {
1682                let (lines, words) = count_lines_words_utf8_fused(chunk);
1683                let chars = count_chars_utf8(chunk);
1684                let max_ll = max_line_length_utf8(chunk);
1685                (lines, words, chars, max_ll)
1686            })
1687            .collect();
1688
1689        let mut counts = WcCounts {
1690            bytes: data.len() as u64,
1691            ..Default::default()
1692        };
1693        for (l, w, c, m) in results {
1694            counts.lines += l;
1695            counts.words += w;
1696            counts.chars += c;
1697            if m > counts.max_line_length {
1698                counts.max_line_length = m;
1699            }
1700        }
1701        counts
1702    } else {
1703        // C locale: fused lines+words per chunk + max_line_length per chunk
1704        let results: Vec<(u64, u64, u64)> = chunks
1705            .par_iter()
1706            .map(|chunk| {
1707                let (lines, words) = count_lines_words(chunk, false);
1708                let max_ll = max_line_length_c(chunk);
1709                (lines, words, max_ll)
1710            })
1711            .collect();
1712
1713        let mut counts = WcCounts {
1714            bytes: data.len() as u64,
1715            chars: data.len() as u64,
1716            ..Default::default()
1717        };
1718        for (l, w, m) in &results {
1719            counts.lines += l;
1720            counts.words += w;
1721            if *m > counts.max_line_length {
1722                counts.max_line_length = *m;
1723            }
1724        }
1725        counts
1726    }
1727}