// coreutils_rs/wc/core.rs
1use memchr::memchr_iter;
2use rayon::prelude::*;
3
4/// Minimum data size to use parallel processing (2MB).
/// A lower threshold lets multiple worker threads contribute even on moderately sized files.
6const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
7
/// Results from counting a byte slice.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of newline bytes (`\n`) — GNU wc counts bytes, not logical lines.
    pub lines: u64,
    /// Word count under the locale's 3-state classification (see module comments).
    pub words: u64,
    /// Total byte count of the input.
    pub bytes: u64,
    /// Character count: equals `bytes` in the C locale, decoded characters in UTF-8.
    pub chars: u64,
    /// Maximum display width of any line (`wc -L` semantics: tabs, wide chars).
    pub max_line_length: u64,
}
17
18// ──────────────────────────────────────────────────
19// 3-state byte classification for word counting
20// ──────────────────────────────────────────────────
21//
22// GNU wc uses mbrtowc() + iswspace() + iswprint() with 3-state logic:
23// 0 = printable (word content): starts or continues a word
24// 1 = space (word break): ends any current word
25// 2 = transparent (unchanged): non-printable, non-space — does NOT change in_word
26//
27// The critical difference from 2-state is that transparent characters
28// (NUL, control chars, invalid UTF-8) do NOT break words.
29// Example: "hello\x00world" is 1 word (NUL is transparent), not 2.
30
/// Build the 3-state byte classification table for the C/POSIX locale.
///
/// In the C locale, mbrtowc() fails for bytes >= 0x80, making them
/// transparent (class 2). The six ASCII whitespace bytes are class 1,
/// and printable ASCII (0x21-0x7E) is word content (class 0).
const fn make_byte_class_c() -> [u8; 256] {
    // Everything starts transparent.
    let mut table = [2u8; 256];
    // ASCII whitespace recognized by iswspace(): \t \n \v \f \r and space.
    let spaces = [0x09u8, 0x0A, 0x0B, 0x0C, 0x0D, 0x20];
    let mut s = 0;
    while s < spaces.len() {
        table[spaces[s] as usize] = 1;
        s += 1;
    }
    // Printable ASCII: '!' (0x21) through '~' (0x7E).
    let mut b = 0x21usize;
    while b <= 0x7E {
        table[b] = 0;
        b += 1;
    }
    table
}
51
52const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
53
/// Build the 3-state single-byte classification table for a UTF-8 locale.
/// Multi-byte UTF-8 sequences are handled by the state machine separately,
/// so the single-byte table only classifies the ASCII range; all other
/// bytes (lead/continuation bytes) are transparent here.
const fn make_byte_class_utf8() -> [u8; 256] {
    let mut table = [2u8; 256];
    let mut b = 0usize;
    while b < 256 {
        table[b] = match b {
            // ASCII whitespace: \t \n \v \f \r and space.
            0x09 | 0x0A | 0x0B | 0x0C | 0x0D | 0x20 => 1,
            // Printable ASCII: word content.
            0x21..=0x7E => 0,
            // Everything else: transparent.
            _ => 2,
        };
        b += 1;
    }
    table
}
73
74const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
75
/// Build the printable-ASCII lookup table: 0x20 (space) through 0x7E (~)
/// map to 1, everything else to 0. Used for `wc -L` column accounting.
const fn make_printable_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut b = 0usize;
    while b < 256 {
        table[b] = matches!(b, 0x20..=0x7E) as u8;
        b += 1;
    }
    table
}
86
87const PRINTABLE_TABLE: [u8; 256] = make_printable_table();
88
89// ──────────────────────────────────────────────────
90// Unicode character classification helpers
91// ──────────────────────────────────────────────────
92
/// Check if a Unicode codepoint is a whitespace character (matching glibc iswspace).
/// Only covers multi-byte Unicode spaces; ASCII spaces are handled by the byte table.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        0x00A0 => true,          // No-Break Space
        0x1680 => true,          // Ogham Space Mark
        0x2000..=0x200A => true, // En Quad through Hair Space
        0x2028 | 0x2029 => true, // Line / Paragraph Separator
        0x202F => true,          // Narrow No-Break Space
        0x205F => true,          // Medium Mathematical Space
        0x3000 => true,          // Ideographic Space
        _ => false,
    }
}
110
/// Check if a Unicode codepoint (>= 0x80) is printable (matching glibc iswprint).
/// C1 control characters (U+0080-U+009F) are not printable; everything from
/// U+00A0 upward is treated as printable.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    matches!(cp, 0xA0..)
}
118
119// ──────────────────────────────────────────────────
120// Core counting functions
121// ──────────────────────────────────────────────────
122
123/// Count newlines using SIMD-accelerated memchr.
124/// GNU wc counts newline bytes (`\n`), not logical lines.
125#[inline]
126pub fn count_lines(data: &[u8]) -> u64 {
127 memchr_iter(b'\n', data).count() as u64
128}
129
/// Count bytes. Trivial but included for API consistency with the other
/// `count_*` functions.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let total = data.len();
    total as u64
}
135
136/// Count words using locale-aware 3-state logic (default: UTF-8).
137pub fn count_words(data: &[u8]) -> u64 {
138 count_words_locale(data, true)
139}
140
141/// Count words with explicit locale control using 3-state logic.
142///
143/// GNU wc classifies each character as:
144/// - space (iswspace=true): sets in_word=false
145/// - printable (iswprint=true): sets in_word=true, increments word count on transition
146/// - transparent (neither): leaves in_word unchanged
147pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
148 if utf8 {
149 count_words_utf8(data)
150 } else {
151 count_words_c(data)
152 }
153}
154
/// Count words in C/POSIX locale using 3-state scalar logic.
/// Only printable ASCII (0x21-0x7E) forms words; the six ASCII whitespace
/// bytes break words; bytes >= 0x80 and non-printable ASCII controls are
/// transparent (they neither start nor end a word).
fn count_words_c(data: &[u8]) -> u64 {
    let mut words = 0u64;
    let mut in_word = false;
    for &byte in data {
        match byte {
            // Whitespace ends any current word.
            0x09..=0x0D | 0x20 => in_word = false,
            // Printable ASCII starts a word unless we are already in one.
            0x21..=0x7E => {
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // Transparent: in_word is left untouched.
            _ => {}
        }
    }
    words
}
177
/// Count words in a C locale chunk, returning word count plus boundary info.
/// Used by parallel word counting.
/// Returns (word_count, first_active_is_printable, ends_in_word).
///
/// "Active" means a non-transparent byte (class 0 or 1). The chunk is scanned
/// assuming in_word = false at entry; the caller uses the boundary flags to
/// undo words that were double-counted across a chunk split.
///
/// NOTE(review): a chunk containing only transparent bytes returns
/// ends_in_word = false even though transparent bytes do not break a word;
/// a caller that chains chunk states must carry the previous state through
/// such chunks — verify at the call site.
fn count_words_c_chunk(data: &[u8]) -> (u64, bool, bool) {
    let mut words = 0u64;
    let mut in_word = false;
    let mut first_active_is_printable = false;
    let mut seen_active = false;

    for &b in data {
        let class = BYTE_CLASS_C[b as usize];
        if class == 1 {
            // Space byte: if it is the chunk's first active byte, the
            // first-active flag stays false (a space cannot continue a word).
            if !seen_active {
                seen_active = true;
                // first_active_is_printable stays false
            }
            in_word = false;
        } else if class == 0 {
            // Printable byte: may both mark the first active byte and
            // open a new word.
            if !seen_active {
                seen_active = true;
                first_active_is_printable = true;
            }
            if !in_word {
                in_word = true;
                words += 1;
            }
        }
        // class == 2: transparent — in_word unchanged
    }
    (words, first_active_is_printable, in_word)
}
208
/// Count words in UTF-8 locale using a state machine with 3-state logic.
///
/// Handles:
/// - ASCII spaces (0x09-0x0D, 0x20): word break
/// - ASCII printable (0x21-0x7E): word content
/// - ASCII non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
/// - Valid UTF-8 multi-byte → check Unicode space/printable
/// - Invalid UTF-8: transparent (GNU wc skips invalid bytes without changing state)
///
/// NOTE(review): sequence validity is checked only via continuation-byte
/// masks, so overlong 3-/4-byte forms and UTF-16 surrogate ranges
/// (0xED 0xA0..) still decode to a codepoint here; surrogates land in the
/// "printable" branch whereas mbrtowc() would reject them — confirm against
/// GNU wc on such input.
fn count_words_utf8(data: &[u8]) -> u64 {
    let mut words = 0u64;
    let mut in_word = false;
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        if b < 0x80 {
            // ASCII: use 3-state lookup table
            let class = BYTE_CLASS_UTF8[b as usize];
            if class == 1 {
                in_word = false;
            } else if class == 0 {
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // class == 2: transparent
            i += 1;
        } else if b < 0xC2 {
            // 0x80-0xBF: standalone continuation byte (invalid UTF-8)
            // 0xC0-0xC1: overlong encoding (invalid UTF-8)
            // Transparent: don't change in_word
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence: need 1 continuation byte
            if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // else: non-printable (e.g., C1 controls U+0080-U+009F) → transparent
                i += 2;
            } else {
                // Invalid sequence: transparent. Only the lead byte is
                // consumed, so the following byte is re-examined on its own.
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence: need 2 continuation bytes
            if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((data[i + 1] as u32 & 0x3F) << 6)
                    | (data[i + 2] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 3;
            } else {
                // Invalid: transparent
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence: need 3 continuation bytes
            if i + 3 < data.len()
                && (data[i + 1] & 0xC0) == 0x80
                && (data[i + 2] & 0xC0) == 0x80
                && (data[i + 3] & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((data[i + 1] as u32 & 0x3F) << 12)
                    | ((data[i + 2] as u32 & 0x3F) << 6)
                    | (data[i + 3] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 4;
            } else {
                // Invalid: transparent
                i += 1;
            }
        } else {
            // 0xF5-0xFF: invalid UTF-8 — transparent
            i += 1;
        }
    }

    words
}
312
313/// Count lines and words using optimized strategies per locale.
314/// UTF-8: separate SIMD memchr + state machine passes.
315/// C locale: single scalar pass with 3-state logic.
316pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
317 if utf8 {
318 let lines = count_lines(data);
319 let words = count_words_utf8(data);
320 (lines, words)
321 } else {
322 let mut lines = 0u64;
323 let mut words = 0u64;
324 let mut in_word = false;
325 for &b in data {
326 if b == b'\n' {
327 lines += 1;
328 }
329 let class = BYTE_CLASS_C[b as usize];
330 if class == 1 {
331 in_word = false;
332 } else if class == 0 {
333 if !in_word {
334 in_word = true;
335 words += 1;
336 }
337 }
338 }
339 (lines, words)
340 }
341}
342
343/// Count lines, words, and chars using optimized strategies per locale.
344pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
345 if utf8 {
346 // Three separate optimized passes (data stays cache-hot between passes)
347 let lines = count_lines(data);
348 let words = count_words_utf8(data);
349 let chars = count_chars_utf8(data);
350 (lines, words, chars)
351 } else {
352 // C locale: single pass for lines + words, chars = byte count
353 let mut lines = 0u64;
354 let mut words = 0u64;
355 let mut in_word = false;
356 for &b in data {
357 if b == b'\n' {
358 lines += 1;
359 }
360 let class = BYTE_CLASS_C[b as usize];
361 if class == 1 {
362 in_word = false;
363 } else if class == 0 {
364 if !in_word {
365 in_word = true;
366 words += 1;
367 }
368 }
369 }
370 (lines, words, data.len() as u64)
371 }
372}
373
/// Count UTF-8 characters by counting non-continuation bytes.
/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
///
/// Processes 64-byte blocks by building a bitmask of character-start bytes
/// and popcounting it; the iterator form replaces the previous manual 8-way
/// unroll (identical results, and `chunks_exact` + `enumerate` lets LLVM
/// drop bounds checks and vectorize the loop).
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    let chunks = data.chunks_exact(64);
    let remainder = chunks.remainder();

    let mut count = 0u64;
    for chunk in chunks {
        // Bit `i` is set iff chunk[i] is NOT a continuation byte.
        let mut char_mask = 0u64;
        for (bit, &b) in chunk.iter().enumerate() {
            char_mask |= (((b & 0xC0) != 0x80) as u64) << bit;
        }
        count += char_mask.count_ones() as u64;
    }

    // Scalar tail for the final partial block.
    count
        + remainder
            .iter()
            .filter(|&&b| (b & 0xC0) != 0x80)
            .count() as u64
}
407
/// Count characters in C/POSIX locale. Single-byte encoding means every
/// byte is exactly one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_count = data.len();
    byte_count as u64
}
413
414/// Count characters, choosing behavior based on locale.
415#[inline]
416pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
417 if utf8 {
418 count_chars_utf8(data)
419 } else {
420 count_chars_c(data)
421 }
422}
423
/// Detect if the current locale uses UTF-8 encoding.
///
/// Checks LC_ALL, then LC_CTYPE, then LANG (the glibc precedence order);
/// the first set-and-non-empty variable decides. Empty or unset variables
/// fall through to the next; if none match, assumes a non-UTF-8 locale.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|var| std::env::var(var).ok())
        .find(|val| !val.is_empty())
        .map(|val| {
            let lower = val.to_ascii_lowercase();
            lower.contains("utf-8") || lower.contains("utf8")
        })
        .unwrap_or(false)
}
436
/// Decode one UTF-8 character from a (non-empty) byte slice.
/// Returns (codepoint, byte_length). On invalid or truncated UTF-8,
/// returns (lead byte as u32, 1) so the caller advances one byte.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    // True iff bytes[idx] exists and is a continuation byte (10xxxxxx).
    let cont = |idx: usize| matches!(bytes.get(idx), Some(&b) if b & 0xC0 == 0x80);
    let lead = bytes[0];
    match lead {
        // ASCII fast path.
        0x00..=0x7F => (lead as u32, 1),
        // 2-byte sequence (0x80..=0xC1 are invalid starts; they fall to `_`).
        0xC2..=0xDF if cont(1) => {
            let cp = ((lead as u32 & 0x1F) << 6) | (bytes[1] as u32 & 0x3F);
            (cp, 2)
        }
        // 3-byte sequence.
        0xE0..=0xEF if cont(1) && cont(2) => {
            let cp = ((lead as u32 & 0x0F) << 12)
                | ((bytes[1] as u32 & 0x3F) << 6)
                | (bytes[2] as u32 & 0x3F);
            (cp, 3)
        }
        // 4-byte sequence (0xF5..=0xFF are invalid starts; they fall to `_`).
        0xF0..=0xF4 if cont(1) && cont(2) && cont(3) => {
            let cp = ((lead as u32 & 0x07) << 18)
                | ((bytes[1] as u32 & 0x3F) << 12)
                | ((bytes[2] as u32 & 0x3F) << 6)
                | (bytes[3] as u32 & 0x3F);
            (cp, 4)
        }
        // Invalid start byte, missing continuation, or truncated input.
        _ => (lead as u32, 1),
    }
}
480
/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
///
/// NOTE(review): this is a hand-maintained approximation of the ranges that
/// wcwidth() reports as width 2 (UAX #11 Wide/Fullwidth plus common wide
/// emoji blocks); it is not generated from Unicode data tables, so spot-check
/// against the target platform's wcwidth for edge ranges.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3041..=0x33BF // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3 // Hangul Syllables
        | 0xF900..=0xFAFF // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19 // Vertical Forms
        | 0xFE30..=0xFE6F // CJK Compatibility Forms
        | 0xFF01..=0xFF60 // Fullwidth Forms (halfwidth katakana at U+FF61+ excluded)
        | 0xFFE0..=0xFFE6 // Fullwidth Signs
        | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
566
/// Compute maximum display width of any line (C/POSIX locale).
///
/// GNU wc -L behavior in C locale:
/// - `\n`: line terminator (records max, resets position)
/// - `\t`: advances to next tab stop (multiple of 8)
/// - `\r`: carriage return (resets position to 0, same line)
/// - `\f`: form feed (acts as line terminator like \n)
/// - Printable ASCII (0x20..0x7E): width 1
/// - Everything else (controls, high bytes): width 0
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0; // widest line seen so far
    let mut line_len: u64 = 0; // widest position reached on the current line
    let mut linepos: u64 = 0; // current cursor column

    for &byte in data {
        match byte {
            // \n and \f both terminate the line: fold its width into the max.
            b'\n' | 0x0C => {
                max_len = max_len.max(line_len);
                line_len = 0;
                linepos = 0;
            }
            // Tab jumps the cursor to the next multiple of 8.
            b'\t' => {
                linepos = (linepos | 7) + 1;
                line_len = line_len.max(linepos);
            }
            // Carriage return rewinds the cursor but the line keeps its width.
            b'\r' => linepos = 0,
            // Printable ASCII occupies one column.
            0x20..=0x7E => {
                linepos += 1;
                line_len = line_len.max(linepos);
            }
            // Other controls and high bytes have zero width.
            _ => {}
        }
    }

    // The final line may be unterminated — account for it too.
    max_len.max(line_len)
}
626
/// Compute maximum display width of any line (UTF-8 locale).
///
/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
///
/// NOTE(review): a stray continuation byte in 0xA0..=0xBF is returned by
/// decode_utf8() as its own value, which is > 0x9F here and therefore gets
/// width 1; combining marks also get width 1 (wcwidth reports 0) — confirm
/// both against GNU wc on such input.
pub fn max_line_length_utf8(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0; // widest line seen so far
    let mut line_len: u64 = 0; // widest position reached on the current line
    let mut linepos: u64 = 0; // current cursor column
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        // Fast path for common ASCII
        if b < 0x80 {
            match b {
                b'\n' => {
                    // Line terminator: record the line's width, reset.
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                b'\t' => {
                    // Advance to the next tab stop (multiple of 8).
                    linepos = (linepos + 8) & !7;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                b'\r' => {
                    // Carriage return rewinds the cursor; width already seen
                    // on this line is retained in line_len.
                    linepos = 0;
                }
                0x0C => {
                    // Form feed: line terminator
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                0x20..=0x7E => {
                    // Printable ASCII
                    linepos += 1;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                _ => {
                    // Non-printable ASCII control chars: width 0
                }
            }
            i += 1;
        } else {
            // Multibyte UTF-8
            let (cp, len) = decode_utf8(&data[i..]);

            // C1 control characters (0x80..0x9F): non-printable
            if cp <= 0x9F {
                // width 0
            } else if is_wide_char(cp) {
                // East Asian Wide/Fullwidth: two columns.
                linepos += 2;
                if linepos > line_len {
                    line_len = linepos;
                }
            } else {
                // Regular printable Unicode character: width 1
                linepos += 1;
                if linepos > line_len {
                    line_len = linepos;
                }
            }
            i += len;
        }
    }

    // Handle last line (may not end with \n)
    if line_len > max_len {
        max_len = line_len;
    }

    max_len
}
709
710/// Compute maximum display width, choosing behavior based on locale.
711#[inline]
712pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
713 if utf8 {
714 max_line_length_utf8(data)
715 } else {
716 max_line_length_c(data)
717 }
718}
719
720/// Count all metrics using optimized individual passes.
721///
722/// Each metric uses its own optimized algorithm:
723/// - Lines: SIMD-accelerated memchr
724/// - Words: 3-state scalar/state-machine (locale-dependent)
725/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
726/// - Max line length: locale-aware display width tracking
727///
728/// Multi-pass is faster than single-pass because each pass has a tight,
729/// specialized loop. After the first pass, data is hot in L2/L3 cache,
730/// making subsequent passes nearly free for memory bandwidth.
731pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
732 WcCounts {
733 lines: count_lines(data),
734 words: count_words_locale(data, utf8),
735 bytes: data.len() as u64,
736 chars: count_chars(data, utf8),
737 max_line_length: max_line_length(data, utf8),
738 }
739}
740
741// ──────────────────────────────────────────────────
742// Parallel counting for large files
743// ──────────────────────────────────────────────────
744
745/// Count newlines in parallel using SIMD memchr + rayon.
746pub fn count_lines_parallel(data: &[u8]) -> u64 {
747 if data.len() < PARALLEL_THRESHOLD {
748 return count_lines(data);
749 }
750
751 let num_threads = rayon::current_num_threads().max(1);
752 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
753
754 data.par_chunks(chunk_size)
755 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
756 .sum()
757}
758
759/// Count words in parallel with boundary adjustment.
760pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
761 if utf8 || data.len() < PARALLEL_THRESHOLD {
762 // UTF-8: state machine can't be trivially parallelized
763 // (multi-byte sequences may span chunk boundaries).
764 return count_words_locale(data, utf8);
765 }
766
767 // C locale: parallel 3-state word counting with boundary adjustment
768 let num_threads = rayon::current_num_threads().max(1);
769 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
770
771 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
772
773 // Each chunk returns (word_count, first_active_is_printable, ends_in_word)
774 let results: Vec<(u64, bool, bool)> = chunks
775 .par_iter()
776 .map(|chunk| count_words_c_chunk(chunk))
777 .collect();
778
779 let mut total = 0u64;
780 for i in 0..results.len() {
781 total += results[i].0;
782 // Boundary adjustment: if previous chunk ended in_word AND
783 // current chunk's first non-transparent byte is printable,
784 // the word was split across chunks — subtract the overcount.
785 if i > 0 && results[i - 1].2 && results[i].1 {
786 total -= 1;
787 }
788 }
789 total
790}
791
792/// Count UTF-8 characters in parallel.
793pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
794 if !utf8 {
795 return data.len() as u64;
796 }
797 if data.len() < PARALLEL_THRESHOLD {
798 return count_chars_utf8(data);
799 }
800
801 let num_threads = rayon::current_num_threads().max(1);
802 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
803
804 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
805}
806
807/// Combined parallel counting of lines + words + chars.
808pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
809 if data.len() < PARALLEL_THRESHOLD {
810 let lines = count_lines(data);
811 let words = count_words_locale(data, utf8);
812 let chars = count_chars(data, utf8);
813 return (lines, words, chars);
814 }
815
816 // Word counting: sequential for UTF-8 (state machine), parallel for C locale
817 let words = count_words_parallel(data, utf8);
818
819 // Lines and chars can always be parallelized safely
820 let num_threads = rayon::current_num_threads().max(1);
821 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
822
823 let lines: u64 = data
824 .par_chunks(chunk_size)
825 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
826 .sum();
827
828 let chars = if utf8 {
829 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
830 } else {
831 data.len() as u64
832 };
833
834 (lines, words, chars)
835}