//! coreutils_rs/wc — core.rs
//!
//! Core counting primitives for a GNU-wc-compatible `wc`: lines, words,
//! bytes, chars, and max line width, with C/POSIX and UTF-8 locale modes
//! plus rayon-parallel variants for large inputs.
1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Minimum data size to use parallel processing (2MB).
/// Lower threshold lets us exploit 4 cores on smaller files.
/// Below this size, rayon's scheduling overhead outweighs the parallel speedup.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
7
/// Results from counting a byte slice.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of newline (`\n`) bytes, as reported by `wc -l`.
    pub lines: u64,
    /// Number of words (`wc -w`), using locale-aware 3-state classification.
    pub words: u64,
    /// Total byte count (`wc -c`).
    pub bytes: u64,
    /// Character count (`wc -m`); equals `bytes` in the C locale.
    pub chars: u64,
    /// Maximum display width of any line (`wc -L`).
    pub max_line_length: u64,
}
17
18// ──────────────────────────────────────────────────
19// 3-state byte classification for word counting
20// ──────────────────────────────────────────────────
21//
22// GNU wc uses mbrtowc() + iswspace() + iswprint() with 3-state logic:
23//   0 = printable (word content): starts or continues a word
24//   1 = space (word break): ends any current word
25//   2 = transparent (unchanged): non-printable, non-space — does NOT change in_word
26//
27// The critical difference from 2-state is that transparent characters
28// (NUL, control chars, invalid UTF-8) do NOT break words.
29// Example: "hello\x00world" is 1 word (NUL is transparent), not 2.
30
/// 3-state byte classification table for the C/POSIX locale.
///
/// In the C locale, mbrtowc() fails for bytes >= 0x80, so those bytes are
/// transparent (class 2). Only printable ASCII (0x21-0x7E) is word content
/// (class 0); the six ASCII whitespace bytes are word breaks (class 1).
const fn make_byte_class_c() -> [u8; 256] {
    // Start fully transparent, then carve out the space and printable ranges.
    let mut table = [2u8; 256];

    // ASCII whitespace recognized by iswspace(): \t \n \v \f \r and space.
    const SPACES: [u8; 6] = [0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x20];
    let mut j = 0;
    while j < SPACES.len() {
        table[SPACES[j] as usize] = 1;
        j += 1;
    }

    // Printable ASCII (0x21-0x7E): word content.
    let mut b = 0x21usize;
    while b <= 0x7E {
        table[b] = 0;
        b += 1;
    }
    table
}

/// Byte classification for the C locale (see `make_byte_class_c`).
const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
53
/// 3-state single-byte classification for the UTF-8 locale.
///
/// Covers the ASCII range only; multi-byte UTF-8 sequences are decoded and
/// classified by the word-counting state machine. The resulting table is
/// currently identical to the C-locale table, since in both locales only
/// printable ASCII (class 0) and ASCII whitespace (class 1) matter at the
/// single-byte level — everything else (class 2) is transparent.
const fn make_byte_class_utf8() -> [u8; 256] {
    // Start fully transparent, then carve out the space and printable ranges.
    let mut table = [2u8; 256];

    // ASCII whitespace recognized by iswspace(): \t \n \v \f \r and space.
    const SPACES: [u8; 6] = [0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x20];
    let mut j = 0;
    while j < SPACES.len() {
        table[SPACES[j] as usize] = 1;
        j += 1;
    }

    // Printable ASCII (0x21-0x7E): word content.
    let mut b = 0x21usize;
    while b <= 0x7E {
        table[b] = 0;
        b += 1;
    }
    table
}

/// Single-byte classification for the UTF-8 locale (see `make_byte_class_utf8`).
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
75
/// Printable ASCII lookup table: 0x20 (space) through 0x7E (~) map to 1,
/// every other byte (controls, DEL, high bytes) to 0.
const fn make_printable_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut b = 0x20usize;
    while b <= 0x7E {
        table[b] = 1;
        b += 1;
    }
    table
}

/// Printable-ASCII membership table (see `make_printable_table`).
const PRINTABLE_TABLE: [u8; 256] = make_printable_table();
88
89// ──────────────────────────────────────────────────
90// Unicode character classification helpers
91// ──────────────────────────────────────────────────
92
/// Check if a Unicode codepoint is a whitespace character (matching glibc iswspace).
/// Only covers multi-byte Unicode spaces; ASCII spaces are handled by the byte table.
///
/// NOTE(review): glibc's i18n LC_CTYPE "space" class includes U+0085 (NEL)
/// but excludes the non-breaking spaces U+00A0, U+2007 (inside the
/// 0x2000..=0x200A range below), and U+202F — this list does the opposite,
/// so word-break decisions may diverge from GNU wc on those codepoints.
/// TODO confirm against the target glibc locale data before changing.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    matches!(
        cp,
        0x00A0 |           // No-Break Space
        0x1680 |           // Ogham Space Mark
        0x2000
            ..=0x200A |  // En Quad through Hair Space
        0x2028 |           // Line Separator
        0x2029 |           // Paragraph Separator
        0x202F |           // Narrow No-Break Space
        0x205F |           // Medium Mathematical Space
        0x3000 // Ideographic Space
    )
}
110
/// Check if a Unicode codepoint (>= 0x80) is printable (matching glibc iswprint).
///
/// The C1 control block (U+0080-U+009F) is non-printable; everything from
/// U+00A0 upward is treated as printable.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    // Equivalent to `cp >= 0xA0`: reject only values below U+00A0.
    !matches!(cp, 0..=0x9F)
}
118
119// ──────────────────────────────────────────────────
120// Core counting functions
121// ──────────────────────────────────────────────────
122
123/// Count newlines using SIMD-accelerated memchr.
124/// GNU wc counts newline bytes (`\n`), not logical lines.
125#[inline]
126pub fn count_lines(data: &[u8]) -> u64 {
127    memchr_iter(b'\n', data).count() as u64
128}
129
/// Count bytes. Trivial but included for API consistency.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let byte_len = data.len();
    byte_len as u64
}
135
/// Count words using locale-aware 3-state logic (default: UTF-8).
/// Convenience wrapper over `count_words_locale` with `utf8 = true`.
pub fn count_words(data: &[u8]) -> u64 {
    count_words_locale(data, true)
}
140
141/// Count words with explicit locale control using 3-state logic.
142///
143/// GNU wc classifies each character as:
144///   - space (iswspace=true): sets in_word=false
145///   - printable (iswprint=true): sets in_word=true, increments word count on transition
146///   - transparent (neither): leaves in_word unchanged
147pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
148    if utf8 {
149        count_words_utf8(data)
150    } else {
151        count_words_c(data)
152    }
153}
154
155/// Count words in C/POSIX locale using 3-state scalar logic.
156/// Only printable ASCII (0x21-0x7E) forms words.
157/// Bytes >= 0x80 and non-printable ASCII controls are transparent.
158fn count_words_c(data: &[u8]) -> u64 {
159    let mut words = 0u64;
160    let mut in_word = false;
161    for &b in data {
162        let class = BYTE_CLASS_C[b as usize];
163        if class == 1 {
164            // Space: break word
165            in_word = false;
166        } else if class == 0 {
167            // Printable: start/continue word
168            if !in_word {
169                in_word = true;
170                words += 1;
171            }
172        }
173        // class == 2: transparent — in_word unchanged
174    }
175    words
176}
177
178/// Count words in a C locale chunk, returning word count plus boundary info.
179/// Used by parallel word counting.
180/// Returns (word_count, first_active_is_printable, ends_in_word).
181fn count_words_c_chunk(data: &[u8]) -> (u64, bool, bool) {
182    let mut words = 0u64;
183    let mut in_word = false;
184    let mut first_active_is_printable = false;
185    let mut seen_active = false;
186
187    for &b in data {
188        let class = BYTE_CLASS_C[b as usize];
189        if class == 1 {
190            if !seen_active {
191                seen_active = true;
192                // first_active_is_printable stays false
193            }
194            in_word = false;
195        } else if class == 0 {
196            if !seen_active {
197                seen_active = true;
198                first_active_is_printable = true;
199            }
200            if !in_word {
201                in_word = true;
202                words += 1;
203            }
204        }
205    }
206    (words, first_active_is_printable, in_word)
207}
208
/// Count words in UTF-8 locale using a state machine with 3-state logic.
///
/// Handles:
/// - ASCII spaces (0x09-0x0D, 0x20): word break
/// - ASCII printable (0x21-0x7E): word content
/// - ASCII non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
/// - Valid UTF-8 multi-byte → check Unicode space/printable
/// - Invalid UTF-8: transparent (GNU wc skips invalid bytes without changing state)
///
/// NOTE(review): the decoder checks continuation-byte structure but not full
/// validity — overlong 3-byte forms (0xE0 0x80 ..), UTF-16 surrogates
/// (U+D800-U+DFFF), and 0xF4-lead values above U+10FFFF all decode to a
/// codepoint >= 0xA0 and are therefore treated as printable word content,
/// whereas glibc mbrtowc() would reject them. Confirm against GNU wc if
/// exact parity on malformed input matters.
fn count_words_utf8(data: &[u8]) -> u64 {
    let mut words = 0u64;
    let mut in_word = false; // 3-state: transparent bytes leave this untouched
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        if b < 0x80 {
            // ASCII: use 3-state lookup table
            let class = BYTE_CLASS_UTF8[b as usize];
            if class == 1 {
                in_word = false;
            } else if class == 0 {
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // class == 2: transparent
            i += 1;
        } else if b < 0xC2 {
            // 0x80-0xBF: standalone continuation byte (invalid UTF-8)
            // 0xC0-0xC1: overlong encoding (invalid UTF-8)
            // Transparent: don't change in_word
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence: need 1 continuation byte
            if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // else: non-printable (e.g., C1 controls U+0080-U+009F) → transparent
                i += 2;
            } else {
                // Invalid sequence: transparent
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence: need 2 continuation bytes
            if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((data[i + 1] as u32 & 0x3F) << 6)
                    | (data[i + 2] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 3;
            } else {
                // Invalid: transparent
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence: need 3 continuation bytes
            if i + 3 < data.len()
                && (data[i + 1] & 0xC0) == 0x80
                && (data[i + 2] & 0xC0) == 0x80
                && (data[i + 3] & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((data[i + 1] as u32 & 0x3F) << 12)
                    | ((data[i + 2] as u32 & 0x3F) << 6)
                    | (data[i + 3] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 4;
            } else {
                // Invalid: transparent
                i += 1;
            }
        } else {
            // 0xF5-0xFF: invalid UTF-8 — transparent
            i += 1;
        }
    }

    words
}
312
313/// Count lines and words using optimized strategies per locale.
314/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
315/// C locale: single scalar pass with 3-state logic.
316pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
317    if utf8 {
318        count_lines_words_utf8_fused(data)
319    } else {
320        let mut lines = 0u64;
321        let mut words = 0u64;
322        let mut in_word = false;
323        for &b in data {
324            if b == b'\n' {
325                lines += 1;
326            }
327            let class = BYTE_CLASS_C[b as usize];
328            if class == 1 {
329                in_word = false;
330            } else if class == 0 {
331                if !in_word {
332                    in_word = true;
333                    words += 1;
334                }
335            }
336        }
337        (lines, words)
338    }
339}
340
/// Fused lines+words counting in UTF-8 mode (single pass).
/// Avoids separate memchr pass for newlines by counting them inline with words.
///
/// The word-state logic mirrors `count_words_utf8`; the only addition is the
/// dedicated `\n` branch, which bumps the line count and (newline being a
/// space) ends any current word.
///
/// NOTE(review): as in `count_words_utf8`, structurally valid but
/// semantically invalid sequences (overlong 3-byte forms, surrogates,
/// 0xF4-lead values above U+10FFFF) decode to codepoints treated as
/// printable — confirm against GNU wc if exact parity on malformed input
/// matters.
fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
    let mut lines = 0u64;
    let mut words = 0u64;
    let mut in_word = false; // 3-state: transparent bytes leave this untouched
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        if b < 0x80 {
            // ASCII fast path: combined newline + word counting
            if b == b'\n' {
                lines += 1;
                in_word = false;
            } else {
                let class = BYTE_CLASS_UTF8[b as usize];
                if class == 1 {
                    in_word = false;
                } else if class == 0 {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // class == 2: transparent
            }
            i += 1;
        } else if b < 0xC2 {
            // Standalone continuation / overlong lead: invalid → transparent
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence: need 1 continuation byte
            if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // else: non-printable (C1 controls) → transparent
                i += 2;
            } else {
                // Invalid sequence: transparent
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence: need 2 continuation bytes
            if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((data[i + 1] as u32 & 0x3F) << 6)
                    | (data[i + 2] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 3;
            } else {
                // Invalid sequence: transparent
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence: need 3 continuation bytes
            if i + 3 < data.len()
                && (data[i + 1] & 0xC0) == 0x80
                && (data[i + 2] & 0xC0) == 0x80
                && (data[i + 3] & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((data[i + 1] as u32 & 0x3F) << 12)
                    | ((data[i + 2] as u32 & 0x3F) << 6)
                    | (data[i + 3] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 4;
            } else {
                // Invalid sequence: transparent
                i += 1;
            }
        } else {
            // 0xF5-0xFF: invalid UTF-8 — transparent
            i += 1;
        }
    }

    (lines, words)
}
432
433/// Count lines, words, and chars using optimized strategies per locale.
434pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
435    if utf8 {
436        // Fused single-pass for lines+words, then fast char-counting pass
437        let (lines, words) = count_lines_words_utf8_fused(data);
438        let chars = count_chars_utf8(data);
439        (lines, words, chars)
440    } else {
441        // C locale: single pass for lines + words, chars = byte count
442        let mut lines = 0u64;
443        let mut words = 0u64;
444        let mut in_word = false;
445        for &b in data {
446            if b == b'\n' {
447                lines += 1;
448            }
449            let class = BYTE_CLASS_C[b as usize];
450            if class == 1 {
451                in_word = false;
452            } else if class == 0 {
453                if !in_word {
454                    in_word = true;
455                    words += 1;
456                }
457            }
458        }
459        (lines, words, data.len() as u64)
460    }
461}
462
/// Count UTF-8 characters by counting non-continuation bytes.
/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
///
/// Processes 64-byte blocks by building a start-of-character bitmask and
/// popcounting it; the sub-64-byte tail is handled with a scalar loop.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    let mut total = 0u64;
    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    for block in blocks {
        // Bit i of `mask` is set iff block[i] is NOT a continuation byte,
        // i.e. it begins a character.
        let mut mask = 0u64;
        for (bit, &byte) in block.iter().enumerate() {
            mask |= ((byte & 0xC0 != 0x80) as u64) << bit;
        }
        total += u64::from(mask.count_ones());
    }

    // Leftover bytes after the last full 64-byte block.
    for &byte in tail {
        total += u64::from(byte & 0xC0 != 0x80);
    }
    total
}
496
/// Count characters in the C/POSIX locale, where every byte is exactly one
/// character — the char count is just the slice length.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    data.len() as u64
}
502
503/// Count characters, choosing behavior based on locale.
504#[inline]
505pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
506    if utf8 {
507        count_chars_utf8(data)
508    } else {
509        count_chars_c(data)
510    }
511}
512
/// Detect if the current locale uses UTF-8 encoding.
///
/// Checks LC_ALL, LC_CTYPE, then LANG (glibc precedence order); the first
/// variable that is set to a non-empty value decides. An empty value falls
/// through to the next variable; if none is set, assumes a non-UTF-8 locale.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map(|value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
        .unwrap_or(false)
}
525
/// Decode one UTF-8 character from a byte slice.
/// Returns (codepoint, byte_length). On invalid UTF-8 — a continuation or
/// overlong lead byte, a lead above 0xF4, or a truncated/malformed sequence —
/// returns (lead byte as u32, 1) so the caller advances exactly one byte.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let lead = bytes[0];
    // Bounds-checked test: is byte `idx` present and a 10xxxxxx continuation?
    let cont = |idx: usize| matches!(bytes.get(idx), Some(&b) if b & 0xC0 == 0x80);

    match lead {
        // 2-byte sequence (leads 0xC0-0xC1 are overlong and fall to `_`).
        0xC2..=0xDF if cont(1) => {
            let cp = ((lead as u32 & 0x1F) << 6) | (bytes[1] as u32 & 0x3F);
            (cp, 2)
        }
        // 3-byte sequence.
        0xE0..=0xEF if cont(1) && cont(2) => {
            let cp = ((lead as u32 & 0x0F) << 12)
                | ((bytes[1] as u32 & 0x3F) << 6)
                | (bytes[2] as u32 & 0x3F);
            (cp, 3)
        }
        // 4-byte sequence (leads above 0xF4 fall to `_`).
        0xF0..=0xF4 if cont(1) && cont(2) && cont(3) => {
            let cp = ((lead as u32 & 0x07) << 18)
                | ((bytes[1] as u32 & 0x3F) << 12)
                | ((bytes[2] as u32 & 0x3F) << 6)
                | (bytes[3] as u32 & 0x3F);
            (cp, 4)
        }
        // ASCII, or any invalid/truncated lead: consume a single byte.
        _ => (lead as u32, 1),
    }
}
569
/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
///
/// Approximates the Unicode East_Asian_Width classes W and F plus common
/// emoji blocks; any codepoint not listed here is given width 1 by the
/// caller. Note that Halfwidth Katakana (U+FF61-U+FF9F, width 1) lies just
/// above the U+FF01-U+FF60 fullwidth range and is deliberately excluded.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F   // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3041..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3  // Hangul Syllables
        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19  // Vertical Forms
        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
        | 0xFF01..=0xFF60  // Fullwidth Forms (fullwidth ASCII/punctuation variants)
        | 0xFFE0..=0xFFE6  // Fullwidth Signs
        | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
655
/// Compute maximum display width of any line (C/POSIX locale).
///
/// GNU wc -L behavior in C locale:
/// - `\n`: line terminator (records max, resets position)
/// - `\t`: advances to next tab stop (multiple of 8)
/// - `\r`: carriage return (resets position to 0, same line)
/// - `\f`: form feed (acts as line terminator like \n)
/// - Printable ASCII (0x20..0x7E): width 1
/// - Everything else (controls, high bytes): width 0
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut widest: u64 = 0; // max width over all completed lines
    let mut line_width: u64 = 0; // widest point reached on the current line
    let mut cursor: u64 = 0; // current column

    for &byte in data {
        match byte {
            // Line terminators (newline and form feed behave identically):
            // commit the current line's width and start a new line.
            b'\n' | 0x0C => {
                widest = widest.max(line_width);
                cursor = 0;
                line_width = 0;
            }
            // Tab advances the cursor to the next multiple of 8.
            b'\t' => {
                cursor = (cursor + 8) & !7;
                line_width = line_width.max(cursor);
            }
            // Carriage return rewinds the cursor without ending the line;
            // `line_width` keeps the widest point already reached.
            b'\r' => cursor = 0,
            _ => {
                // Printable ASCII occupies one column; everything else
                // (controls, high bytes) has width zero. This range test is
                // exactly the PRINTABLE_TABLE membership condition.
                if matches!(byte, 0x20..=0x7E) {
                    cursor += 1;
                    line_width = line_width.max(cursor);
                }
            }
        }
    }

    // The final line may lack a terminator; commit it too.
    widest.max(line_width)
}
715
/// Compute maximum display width of any line (UTF-8 locale).
///
/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
///
/// NOTE(review): on invalid UTF-8, `decode_utf8` returns the lead byte value
/// as the "codepoint", so invalid bytes 0xA0-0xFF land in the width-1 branch
/// rather than width 0 — confirm against GNU wc, which skips bytes that
/// mbrtowc() rejects. Zero-width characters (combining marks, ZWJ) are also
/// counted as width 1 here while wcwidth() reports 0 — TODO confirm this
/// divergence is acceptable.
pub fn max_line_length_utf8(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0; // widest completed line so far
    let mut line_len: u64 = 0; // widest point reached on the current line
    let mut linepos: u64 = 0; // current cursor column
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        // Fast path for common ASCII
        if b < 0x80 {
            match b {
                b'\n' => {
                    // Line terminator: commit width, reset for next line.
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                b'\t' => {
                    // Advance to the next tab stop (multiple of 8).
                    linepos = (linepos + 8) & !7;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                b'\r' => {
                    // Carriage return: rewind cursor, same line.
                    linepos = 0;
                }
                0x0C => {
                    // Form feed: line terminator
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                0x20..=0x7E => {
                    // Printable ASCII
                    linepos += 1;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                _ => {
                    // Non-printable ASCII control chars: width 0
                }
            }
            i += 1;
        } else {
            // Multibyte UTF-8
            let (cp, len) = decode_utf8(&data[i..]);

            // C1 control characters (0x80..0x9F): non-printable
            if cp <= 0x9F {
                // width 0
            } else if is_wide_char(cp) {
                // East Asian Wide/Fullwidth: two columns.
                linepos += 2;
                if linepos > line_len {
                    line_len = linepos;
                }
            } else {
                // Regular printable Unicode character: width 1
                linepos += 1;
                if linepos > line_len {
                    line_len = linepos;
                }
            }
            i += len;
        }
    }

    // Handle last line (may not end with \n)
    if line_len > max_len {
        max_len = line_len;
    }

    max_len
}
798
799/// Compute maximum display width, choosing behavior based on locale.
800#[inline]
801pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
802    if utf8 {
803        max_line_length_utf8(data)
804    } else {
805        max_line_length_c(data)
806    }
807}
808
809/// Count all metrics using optimized individual passes.
810///
811/// Each metric uses its own optimized algorithm:
812/// - Lines: SIMD-accelerated memchr
813/// - Words: 3-state scalar/state-machine (locale-dependent)
814/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
815/// - Max line length: locale-aware display width tracking
816///
817/// Multi-pass is faster than single-pass because each pass has a tight,
818/// specialized loop. After the first pass, data is hot in L2/L3 cache,
819/// making subsequent passes nearly free for memory bandwidth.
820pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
821    if utf8 {
822        let (lines, words) = count_lines_words_utf8_fused(data);
823        WcCounts {
824            lines,
825            words,
826            bytes: data.len() as u64,
827            chars: count_chars_utf8(data),
828            max_line_length: max_line_length_utf8(data),
829        }
830    } else {
831        WcCounts {
832            lines: count_lines(data),
833            words: count_words_locale(data, false),
834            bytes: data.len() as u64,
835            chars: data.len() as u64,
836            max_line_length: max_line_length_c(data),
837        }
838    }
839}
840
841// ──────────────────────────────────────────────────
842// Parallel counting for large files
843// ──────────────────────────────────────────────────
844
845/// Count newlines in parallel using SIMD memchr + rayon.
846pub fn count_lines_parallel(data: &[u8]) -> u64 {
847    if data.len() < PARALLEL_THRESHOLD {
848        return count_lines(data);
849    }
850
851    let num_threads = rayon::current_num_threads().max(1);
852    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
853
854    data.par_chunks(chunk_size)
855        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
856        .sum()
857}
858
859/// Count words in parallel with boundary adjustment.
860pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
861    if utf8 || data.len() < PARALLEL_THRESHOLD {
862        // UTF-8: state machine can't be trivially parallelized
863        // (multi-byte sequences may span chunk boundaries).
864        return count_words_locale(data, utf8);
865    }
866
867    // C locale: parallel 3-state word counting with boundary adjustment
868    let num_threads = rayon::current_num_threads().max(1);
869    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
870
871    let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
872
873    // Each chunk returns (word_count, first_active_is_printable, ends_in_word)
874    let results: Vec<(u64, bool, bool)> = chunks
875        .par_iter()
876        .map(|chunk| count_words_c_chunk(chunk))
877        .collect();
878
879    let mut total = 0u64;
880    for i in 0..results.len() {
881        total += results[i].0;
882        // Boundary adjustment: if previous chunk ended in_word AND
883        // current chunk's first non-transparent byte is printable,
884        // the word was split across chunks — subtract the overcount.
885        if i > 0 && results[i - 1].2 && results[i].1 {
886            total -= 1;
887        }
888    }
889    total
890}
891
892/// Count UTF-8 characters in parallel.
893pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
894    if !utf8 {
895        return data.len() as u64;
896    }
897    if data.len() < PARALLEL_THRESHOLD {
898        return count_chars_utf8(data);
899    }
900
901    let num_threads = rayon::current_num_threads().max(1);
902    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
903
904    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
905}
906
/// Count lines + words + bytes in a single fused pass (the default wc mode).
/// Avoids separate passes entirely — combines newline counting with word detection.
/// Bytes come from the slice length, so `data` is traversed exactly once
/// (inside `count_lines_words`).
pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
    let (lines, words) = count_lines_words(data, utf8);
    (lines, words, data.len() as u64)
}
913
914/// Parallel counting of lines + words + bytes only (no chars).
915/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
916pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
917    if data.len() < PARALLEL_THRESHOLD {
918        // Small file: use fused single-pass
919        return count_lwb(data, utf8);
920    }
921
922    // Word counting must be sequential for UTF-8 (state machine across chunks)
923    // But we use the fused lines+words approach to avoid a separate memchr pass
924    let (lines, words) = if utf8 {
925        count_lines_words_utf8_fused(data)
926    } else {
927        // C locale: parallel 3-state word counting with boundary adjustment
928        let num_threads = rayon::current_num_threads().max(1);
929        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
930
931        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
932        let results: Vec<(u64, bool, bool)> = chunks
933            .par_iter()
934            .map(|chunk| count_words_c_chunk(chunk))
935            .collect();
936
937        let mut word_total = 0u64;
938        for i in 0..results.len() {
939            word_total += results[i].0;
940            if i > 0 && results[i - 1].2 && results[i].1 {
941                word_total -= 1;
942            }
943        }
944
945        let line_total: u64 = data
946            .par_chunks(chunk_size)
947            .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
948            .sum();
949
950        (line_total, word_total)
951    };
952
953    (lines, words, data.len() as u64)
954}
955
956/// Combined parallel counting of lines + words + chars.
957pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
958    if data.len() < PARALLEL_THRESHOLD {
959        let lines = count_lines(data);
960        let words = count_words_locale(data, utf8);
961        let chars = count_chars(data, utf8);
962        return (lines, words, chars);
963    }
964
965    // Word counting: sequential for UTF-8 (state machine), parallel for C locale
966    let words = count_words_parallel(data, utf8);
967
968    // Lines and chars can always be parallelized safely
969    let num_threads = rayon::current_num_threads().max(1);
970    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
971
972    let lines: u64 = data
973        .par_chunks(chunk_size)
974        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
975        .sum();
976
977    let chars = if utf8 {
978        data.par_chunks(chunk_size).map(count_chars_utf8).sum()
979    } else {
980        data.len() as u64
981    };
982
983    (lines, words, chars)
984}