//! coreutils_rs/wc/core.rs — core counting routines for a GNU-compatible `wc`.
1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Minimum input size, in bytes, before parallel processing is attempted (2 MiB).
/// A lower threshold lets multi-core machines help even on moderately sized files.
/// NOTE(review): not referenced in this chunk — presumably consumed by the
/// rayon-based paths further down the file; verify before changing or removing.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
7
/// Results from counting a byte slice.
///
/// Each field corresponds to one of GNU `wc`'s outputs
/// (-l / -w / -c / -m / -L respectively).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of newline (`\n`) bytes seen (`wc -l`).
    pub lines: u64,
    /// Number of words (`wc -w`).
    pub words: u64,
    /// Raw byte length of the input (`wc -c`).
    pub bytes: u64,
    /// Character count (`wc -m`): bytes in the C locale, scalars in UTF-8.
    pub chars: u64,
    /// Maximum display width of any line (`wc -L`).
    pub max_line_length: u64,
}
17
18// ──────────────────────────────────────────────────
19// 3-state byte classification for word counting
20// ──────────────────────────────────────────────────
21//
22// GNU wc uses mbrtowc() + iswspace() + iswprint() with 3-state logic:
23//   0 = printable (word content): starts or continues a word
24//   1 = space (word break): ends any current word
25//   2 = transparent (unchanged): non-printable, non-space — does NOT change in_word
26//
27// The critical difference from 2-state is that transparent characters
28// (NUL, control chars, invalid UTF-8) do NOT break words.
29// Example: "hello\x00world" is 1 word (NUL is transparent), not 2.
30
/// 3-state byte classification table for the C/POSIX locale.
///
/// In the C locale, mbrtowc() fails for every byte >= 0x80, so those bytes
/// are transparent. Only printable ASCII forms word content.
/// Classes: 0 = word content, 1 = space, 2 = transparent.
const fn make_byte_class_c() -> [u8; 256] {
    // Start fully transparent, then carve out spaces and printables.
    let mut table = [2u8; 256];

    // ASCII whitespace recognized by iswspace() in the C locale.
    const SPACES: [u8; 6] = [0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x20]; // \t \n \v \f \r SP
    let mut s = 0;
    while s < SPACES.len() {
        table[SPACES[s] as usize] = 1;
        s += 1;
    }

    // GNU compat: mbrtowc() returns L'\0' for the null byte, and GNU wc
    // treats it as non-space printable, i.e. it starts/continues words.
    table[0x00] = 0;

    // Printable ASCII 0x21..=0x7E is word content.
    let mut b = 0x21usize;
    while b <= 0x7E {
        table[b] = 0;
        b += 1;
    }

    table
}

/// Byte-class table for the C locale, built once at compile time.
const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
57
/// Single-byte (ASCII) classification table for UTF-8 locales.
/// Multi-byte sequences never consult this table; the decoder handles them.
/// Classes: 0 = word content, 1 = space, 2 = transparent.
const fn make_byte_class_utf8() -> [u8; 256] {
    // Everything transparent by default (incl. NUL and bytes >= 0x80).
    let mut table = [2u8; 256];

    // ASCII whitespace in the single-byte range.
    const SPACES: [u8; 6] = [0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x20]; // \t \n \v \f \r SP
    let mut s = 0;
    while s < SPACES.len() {
        table[SPACES[s] as usize] = 1;
        s += 1;
    }

    // Printable ASCII 0x21..=0x7E is word content.
    let mut b = 0x21usize;
    while b <= 0x7E {
        table[b] = 0;
        b += 1;
    }

    table
}

/// Byte-class table for UTF-8 locales, built once at compile time.
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
79
/// Printable-ASCII lookup: 1 for 0x20 (space) through 0x7E (~), 0 otherwise.
const fn make_printable_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut b = 0x20usize;
    while b <= 0x7E {
        table[b] = 1;
        b += 1;
    }
    table
}

/// Printable-ASCII table, built once at compile time.
const PRINTABLE_TABLE: [u8; 256] = make_printable_table();
92
93// ──────────────────────────────────────────────────
94// Unicode character classification helpers
95// ──────────────────────────────────────────────────
96
/// True if `cp` is a multi-byte Unicode whitespace codepoint.
///
/// ASCII whitespace is handled by the byte tables and is deliberately
/// absent here.
/// NOTE(review): this set includes U+00A0 and U+202F, which glibc's
/// iswspace() classifies as non-space in common locales — confirm against
/// the targeted glibc version.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        0x00A0            // No-Break Space
        | 0x1680          // Ogham Space Mark
        | 0x2028          // Line Separator
        | 0x2029          // Paragraph Separator
        | 0x202F          // Narrow No-Break Space
        | 0x205F          // Medium Mathematical Space
        | 0x3000 => true, // Ideographic Space
        // En Quad through Hair Space
        other => (0x2000..=0x200A).contains(&other),
    }
}
114
/// True if a non-ASCII codepoint is printable per glibc iswprint():
/// everything from U+00A0 up. The C1 controls (U+0080..=U+009F) are not.
/// Callers only pass decoded multi-byte values (>= 0x80).
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    (0xA0..=u32::MAX).contains(&cp)
}
122
123// ──────────────────────────────────────────────────
124// Core counting functions
125// ──────────────────────────────────────────────────
126
127/// Count newlines using SIMD-accelerated memchr.
128/// GNU wc counts newline bytes (`\n`), not logical lines.
129#[inline]
130pub fn count_lines(data: &[u8]) -> u64 {
131    memchr_iter(b'\n', data).count() as u64
132}
133
/// Byte count of the input. Trivial, but keeps the counting API uniform.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let n = data.len();
    n as u64
}
139
140/// Count words using locale-aware 3-state logic (default: UTF-8).
141pub fn count_words(data: &[u8]) -> u64 {
142    count_words_locale(data, true)
143}
144
145/// Count words with explicit locale control using 3-state logic.
146///
147/// GNU wc classifies each character as:
148///   - space (iswspace=true): sets in_word=false
149///   - printable (iswprint=true): sets in_word=true, increments word count on transition
150///   - transparent (neither): leaves in_word unchanged
151pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
152    if utf8 {
153        count_words_utf8(data)
154    } else {
155        count_words_c(data)
156    }
157}
158
159/// Count words in C/POSIX locale using 3-state scalar logic.
160/// Only printable ASCII (0x21-0x7E) forms words.
161/// Bytes >= 0x80 and non-printable ASCII controls are transparent.
162fn count_words_c(data: &[u8]) -> u64 {
163    let mut words = 0u64;
164    let mut in_word = false;
165    for &b in data {
166        let class = BYTE_CLASS_C[b as usize];
167        if class == 1 {
168            // Space: break word
169            in_word = false;
170        } else if class == 0 {
171            // Printable: start/continue word
172            if !in_word {
173                in_word = true;
174                words += 1;
175            }
176        }
177        // class == 2: transparent — in_word unchanged
178    }
179    words
180}
181
182/// Count words in a C locale chunk, returning word count plus boundary info.
183/// Used by parallel word counting.
184/// Returns (word_count, first_active_is_printable, ends_in_word).
185fn count_words_c_chunk(data: &[u8]) -> (u64, bool, bool) {
186    let mut words = 0u64;
187    let mut in_word = false;
188    let mut first_active_is_printable = false;
189    let mut seen_active = false;
190
191    for &b in data {
192        let class = BYTE_CLASS_C[b as usize];
193        if class == 1 {
194            if !seen_active {
195                seen_active = true;
196                // first_active_is_printable stays false
197            }
198            in_word = false;
199        } else if class == 0 {
200            if !seen_active {
201                seen_active = true;
202                first_active_is_printable = true;
203            }
204            if !in_word {
205                in_word = true;
206                words += 1;
207            }
208        }
209    }
210    (words, first_active_is_printable, in_word)
211}
212
/// Count words in UTF-8 locale using a state machine with 3-state logic.
///
/// Handles:
/// - ASCII spaces (0x09-0x0D, 0x20): word break
/// - ASCII printable (0x21-0x7E): word content
/// - ASCII non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
/// - Valid UTF-8 multi-byte sequences: decoded, then classified with the
///   Unicode space/printable predicates
/// - Invalid UTF-8: transparent — consumed one byte at a time without
///   changing `in_word` (GNU wc skips bytes mbrtowc() rejects)
///
/// NOTE(review): sequences are validated structurally (leading-byte range
/// plus continuation bytes), but overlong 3-byte forms, UTF-16 surrogates
/// (U+D800..U+DFFF), and 4-byte values above U+10FFFF are still decoded and
/// classified by their numeric value rather than treated as invalid —
/// confirm this matches GNU/mbrtowc behavior on such input.
fn count_words_utf8(data: &[u8]) -> u64 {
    let mut words = 0u64;
    let mut in_word = false; // true while inside a run of word content
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        if b < 0x80 {
            // ASCII: use 3-state lookup table
            let class = BYTE_CLASS_UTF8[b as usize];
            if class == 1 {
                in_word = false;
            } else if class == 0 {
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // class == 2: transparent
            i += 1;
        } else if b < 0xC2 {
            // 0x80-0xBF: standalone continuation byte (invalid UTF-8)
            // 0xC0-0xC1: overlong 2-byte encoding (invalid UTF-8)
            // Transparent: don't change in_word
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence: need 1 continuation byte
            if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // else: non-printable (e.g., C1 controls U+0080-U+009F) → transparent
                i += 2;
            } else {
                // Truncated/invalid sequence: transparent, resync on next byte
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence: need 2 continuation bytes
            if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((data[i + 1] as u32 & 0x3F) << 6)
                    | (data[i + 2] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 3;
            } else {
                // Truncated/invalid: transparent
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence: need 3 continuation bytes
            if i + 3 < data.len()
                && (data[i + 1] & 0xC0) == 0x80
                && (data[i + 2] & 0xC0) == 0x80
                && (data[i + 3] & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((data[i + 1] as u32 & 0x3F) << 12)
                    | ((data[i + 2] as u32 & 0x3F) << 6)
                    | (data[i + 3] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 4;
            } else {
                // Truncated/invalid: transparent
                i += 1;
            }
        } else {
            // 0xF5-0xFF: can never start a valid UTF-8 sequence — transparent
            i += 1;
        }
    }

    words
}
316
317/// Count lines and words using optimized strategies per locale.
318/// UTF-8: fused single-pass for lines+words to avoid extra data traversal.
319/// C locale: single scalar pass with 3-state logic.
320pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
321    if utf8 {
322        count_lines_words_utf8_fused(data)
323    } else {
324        let mut lines = 0u64;
325        let mut words = 0u64;
326        let mut in_word = false;
327        for &b in data {
328            if b == b'\n' {
329                lines += 1;
330            }
331            let class = BYTE_CLASS_C[b as usize];
332            if class == 1 {
333                in_word = false;
334            } else if class == 0 {
335                if !in_word {
336                    in_word = true;
337                    words += 1;
338                }
339            }
340        }
341        (lines, words)
342    }
343}
344
/// Fused lines+words counting in UTF-8 mode (single pass).
///
/// Same decoding/classification scheme as `count_words_utf8`, with newline
/// counting folded into the ASCII branch so the data is traversed once
/// instead of needing a separate memchr pass.
///
/// NOTE(review): deliberately kept in lockstep with `count_words_utf8`; any
/// change to the word-state logic must be mirrored there.
fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
    let mut lines = 0u64;
    let mut words = 0u64;
    let mut in_word = false; // true while inside a run of word content
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        if b < 0x80 {
            // ASCII fast path: combined newline + word counting
            if b == b'\n' {
                lines += 1;
                in_word = false; // '\n' is also a word break
            } else {
                let class = BYTE_CLASS_UTF8[b as usize];
                if class == 1 {
                    in_word = false;
                } else if class == 0 {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // class == 2: transparent
            }
            i += 1;
        } else if b < 0xC2 {
            // Continuation byte or overlong lead: invalid UTF-8, transparent
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence: need 1 continuation byte
            if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 2;
            } else {
                // Truncated/invalid: transparent
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence: need 2 continuation bytes
            if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((data[i + 1] as u32 & 0x3F) << 6)
                    | (data[i + 2] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 3;
            } else {
                // Truncated/invalid: transparent
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence: need 3 continuation bytes
            if i + 3 < data.len()
                && (data[i + 1] & 0xC0) == 0x80
                && (data[i + 2] & 0xC0) == 0x80
                && (data[i + 3] & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((data[i + 1] as u32 & 0x3F) << 12)
                    | ((data[i + 2] as u32 & 0x3F) << 6)
                    | (data[i + 3] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 4;
            } else {
                // Truncated/invalid: transparent
                i += 1;
            }
        } else {
            // 0xF5-0xFF: can never start a valid UTF-8 sequence — transparent
            i += 1;
        }
    }

    (lines, words)
}
436
437/// Count lines, words, and chars using optimized strategies per locale.
438pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
439    if utf8 {
440        // Fused single-pass for lines+words, then fast char-counting pass
441        let (lines, words) = count_lines_words_utf8_fused(data);
442        let chars = count_chars_utf8(data);
443        (lines, words, chars)
444    } else {
445        // C locale: single pass for lines + words, chars = byte count
446        let mut lines = 0u64;
447        let mut words = 0u64;
448        let mut in_word = false;
449        for &b in data {
450            if b == b'\n' {
451                lines += 1;
452            }
453            let class = BYTE_CLASS_C[b as usize];
454            if class == 1 {
455                in_word = false;
456            } else if class == 0 {
457                if !in_word {
458                    in_word = true;
459                    words += 1;
460                }
461            }
462        }
463        (lines, words, data.len() as u64)
464    }
465}
466
/// Count UTF-8 characters by counting bytes that are NOT continuation bytes.
///
/// A continuation byte matches `10xxxxxx` (0x80..=0xBF); every other byte
/// begins a character (ASCII, a multi-byte leader, or an invalid byte).
///
/// Processes 64-byte blocks: build a bitmask of character-start bytes and
/// popcount it, which the optimizer lowers to wide SIMD compares.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    let mut total = 0u64;
    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    for block in blocks {
        // Bit i is set iff block[i] starts a character.
        let mut starts = 0u64;
        for (bit, &byte) in block.iter().enumerate() {
            starts |= (((byte & 0xC0) != 0x80) as u64) << bit;
        }
        total += u64::from(starts.count_ones());
    }

    // Scalar cleanup for the final partial block.
    total + tail.iter().filter(|&&b| (b & 0xC0) != 0x80).count() as u64
}
500
/// Character count in the C/POSIX locale: every byte is one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let n = data.len();
    n as u64
}
506
507/// Count characters, choosing behavior based on locale.
508#[inline]
509pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
510    if utf8 {
511        count_chars_utf8(data)
512    } else {
513        count_chars_c(data)
514    }
515}
516
/// Best-effort detection of a UTF-8 locale from the environment.
///
/// Honors the usual precedence (LC_ALL overrides LC_CTYPE overrides LANG):
/// the first variable that is set and non-empty decides. Returns false when
/// none is set, matching the POSIX default C locale.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map(|value| {
            let value = value.to_ascii_lowercase();
            // Accept both spellings, e.g. "en_US.UTF-8" and "C.utf8".
            value.contains("utf-8") || value.contains("utf8")
        })
        .unwrap_or(false)
}
529
/// Decode a single UTF-8 scalar from the front of `bytes`.
///
/// Returns `(codepoint, bytes_consumed)`. Any invalid leading byte, missing
/// continuation byte, or truncated sequence yields the leading byte itself
/// with length 1 so the caller can resynchronize.
/// NOTE(review): overlong 3/4-byte forms and surrogates are NOT rejected
/// here; callers classify the decoded value as-is.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let lead = bytes[0];

    // Expected sequence length from the leading byte. 0x80..=0xC1 and
    // 0xF5..=0xFF can never start a valid sequence.
    let need: usize = match lead {
        0x00..=0x7F => return (lead as u32, 1), // ASCII fast path
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        _ => return (lead as u32, 1), // invalid start byte
    };

    if bytes.len() < need {
        // Sequence runs past the end of the buffer.
        return (lead as u32, 1);
    }

    // Every continuation byte must match 10xxxxxx.
    let mut k = 1;
    while k < need {
        if bytes[k] & 0xC0 != 0x80 {
            return (lead as u32, 1);
        }
        k += 1;
    }

    // Fold the payload bits together (leader mask indexed by length).
    let lead_mask = [0u32, 0x7F, 0x1F, 0x0F, 0x07][need];
    let mut cp = lead as u32 & lead_mask;
    for &cont in &bytes[1..need] {
        cp = (cp << 6) | (cont as u32 & 0x3F);
    }
    (cp, need)
}
573
/// True if `cp` occupies zero display columns: combining marks, format
/// controls, zero-width joiners, variation selectors, and similar.
///
/// GNU wc computes `-L` via wcwidth(), which returns 0 for these; this
/// table mirrors that so widths agree.
/// NOTE(review): hand-maintained range list — re-verify against the
/// wcwidth data of the targeted glibc/Unicode version when updating.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    matches!(
        cp,
        0x0300..=0x036F   // Combining Diacritical Marks
        | 0x0483..=0x0489 // Cyrillic combining marks
        | 0x0591..=0x05BD // Hebrew combining marks
        | 0x05BF
        | 0x05C1..=0x05C2
        | 0x05C4..=0x05C5
        | 0x05C7
        | 0x0600..=0x0605 // Arabic number signs
        | 0x0610..=0x061A // Arabic combining marks
        | 0x064B..=0x065F // Arabic combining marks
        | 0x0670
        | 0x06D6..=0x06DD
        | 0x06DF..=0x06E4
        | 0x06E7..=0x06E8
        | 0x06EA..=0x06ED
        | 0x070F
        | 0x0711
        | 0x0730..=0x074A
        | 0x07A6..=0x07B0
        | 0x07EB..=0x07F3
        | 0x07FD
        | 0x0816..=0x0819
        | 0x081B..=0x0823
        | 0x0825..=0x0827
        | 0x0829..=0x082D
        | 0x0859..=0x085B
        | 0x08D3..=0x08E1
        | 0x08E3..=0x0902
        | 0x093A
        | 0x093C
        | 0x0941..=0x0948
        | 0x094D
        | 0x0951..=0x0957
        | 0x0962..=0x0963
        | 0x0981
        | 0x09BC
        | 0x09C1..=0x09C4
        | 0x09CD
        | 0x09E2..=0x09E3
        | 0x09FE
        | 0x0A01..=0x0A02
        | 0x0A3C
        | 0x0A41..=0x0A42
        | 0x0A47..=0x0A48
        | 0x0A4B..=0x0A4D
        | 0x0A51
        | 0x0A70..=0x0A71
        | 0x0A75
        | 0x0A81..=0x0A82
        | 0x0ABC
        | 0x0AC1..=0x0AC5
        | 0x0AC7..=0x0AC8
        | 0x0ACD
        | 0x0AE2..=0x0AE3
        | 0x0AFA..=0x0AFF
        | 0x0B01
        | 0x0B3C
        | 0x0B3F
        | 0x0B41..=0x0B44
        | 0x0B4D
        | 0x0B56
        | 0x0B62..=0x0B63
        | 0x0B82
        | 0x0BC0
        | 0x0BCD
        | 0x0C00
        | 0x0C04
        | 0x0C3E..=0x0C40
        | 0x0C46..=0x0C48
        | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56
        | 0x0C62..=0x0C63
        | 0x0C81
        | 0x0CBC
        | 0x0CBF
        | 0x0CC6
        | 0x0CCC..=0x0CCD
        | 0x0CE2..=0x0CE3
        | 0x0D00..=0x0D01
        | 0x0D3B..=0x0D3C
        | 0x0D41..=0x0D44
        | 0x0D4D
        | 0x0D62..=0x0D63
        | 0x0DCA
        | 0x0DD2..=0x0DD4
        | 0x0DD6
        | 0x0E31
        | 0x0E34..=0x0E3A
        | 0x0E47..=0x0E4E
        | 0x0EB1
        | 0x0EB4..=0x0EBC
        | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19
        | 0x0F35
        | 0x0F37
        | 0x0F39
        | 0x0F71..=0x0F7E
        | 0x0F80..=0x0F84
        | 0x0F86..=0x0F87
        | 0x0F8D..=0x0F97
        | 0x0F99..=0x0FBC
        | 0x0FC6
        | 0x102D..=0x1030
        | 0x1032..=0x1037
        | 0x1039..=0x103A
        | 0x103D..=0x103E
        | 0x1058..=0x1059
        | 0x105E..=0x1060
        | 0x1071..=0x1074
        | 0x1082
        | 0x1085..=0x1086
        | 0x108D
        | 0x109D
        | 0x1160..=0x11FF // Hangul Jamo medial vowels and final consonants
        | 0x135D..=0x135F
        | 0x1712..=0x1714
        | 0x1732..=0x1734
        | 0x1752..=0x1753
        | 0x1772..=0x1773
        | 0x17B4..=0x17B5
        | 0x17B7..=0x17BD
        | 0x17C6
        | 0x17C9..=0x17D3
        | 0x17DD
        | 0x180B..=0x180D
        | 0x1885..=0x1886
        | 0x18A9
        | 0x1920..=0x1922
        | 0x1927..=0x1928
        | 0x1932
        | 0x1939..=0x193B
        | 0x1A17..=0x1A18
        | 0x1A1B
        | 0x1A56
        | 0x1A58..=0x1A5E
        | 0x1A60
        | 0x1A62
        | 0x1A65..=0x1A6C
        | 0x1A73..=0x1A7C
        | 0x1A7F
        | 0x1AB0..=0x1ABE
        | 0x1B00..=0x1B03
        | 0x1B34
        | 0x1B36..=0x1B3A
        | 0x1B3C
        | 0x1B42
        | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81
        | 0x1BA2..=0x1BA5
        | 0x1BA8..=0x1BA9
        | 0x1BAB..=0x1BAD
        | 0x1BE6
        | 0x1BE8..=0x1BE9
        | 0x1BED
        | 0x1BEF..=0x1BF1
        | 0x1C2C..=0x1C33
        | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2
        | 0x1CD4..=0x1CE0
        | 0x1CE2..=0x1CE8
        | 0x1CED
        | 0x1CF4
        | 0x1CF8..=0x1CF9
        | 0x1DC0..=0x1DF9
        | 0x1DFB..=0x1DFF
        | 0x200B..=0x200F // Zero-width space, ZWNJ, ZWJ, LRM, RLM
        | 0x202A..=0x202E // Bidi control chars
        | 0x2060..=0x2064 // Word joiner, invisible operators
        | 0x2066..=0x206F // Bidi isolates
        | 0x20D0..=0x20F0 // Combining marks for symbols
        | 0xFE00..=0xFE0F // Variation Selectors
        | 0xFE20..=0xFE2F // Combining Half Marks
        | 0xFEFF          // Zero Width No-Break Space (BOM)
        | 0xFFF9..=0xFFFB // Interlinear annotation anchors
        | 0x1D167..=0x1D169
        | 0x1D173..=0x1D182
        | 0x1D185..=0x1D18B
        | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001
        | 0xE0020..=0xE007F
        | 0xE0100..=0xE01EF // Variation Selectors Supplement
    )
}
764
/// True if `cp` renders as two display columns (East Asian Wide/Fullwidth,
/// plus the emoji blocks treated as wide).
///
/// Mirrors glibc wcwidth() returning 2, so `wc -L` widths agree with GNU.
/// NOTE(review): hand-maintained range list — re-verify against the
/// wcwidth/EastAsianWidth data of the targeted glibc/Unicode version.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F   // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E  // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3040..=0x33BF  // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF  // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF  // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C  // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3  // Hangul Syllables
        | 0xF900..=0xFAFF  // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19  // Vertical Forms
        | 0xFE30..=0xFE6F  // CJK Compatibility Forms
        | 0xFF01..=0xFF60  // Fullwidth Latin, Halfwidth Katakana
        | 0xFFE0..=0xFFE6  // Fullwidth Signs
        | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
851
852/// Compute maximum display width of any line (C/POSIX locale).
853///
854/// GNU wc -L behavior in C locale:
855/// - `\n`: line terminator (records max, resets position)
856/// - `\t`: advances to next tab stop (multiple of 8)
857/// - `\r`: carriage return (resets position to 0, same line)
858/// - `\f`: form feed (acts as line terminator like \n)
859/// - Printable ASCII (0x20..0x7E): width 1
860/// - Everything else (controls, high bytes): width 0
861pub fn max_line_length_c(data: &[u8]) -> u64 {
862    let mut max_len: u64 = 0;
863    let mut line_len: u64 = 0; // max position seen on current line
864    let mut linepos: u64 = 0; // current cursor position
865
866    for &b in data {
867        match b {
868            b'\n' => {
869                if line_len > max_len {
870                    max_len = line_len;
871                }
872                linepos = 0;
873                line_len = 0;
874            }
875            b'\t' => {
876                linepos = (linepos + 8) & !7;
877                if linepos > line_len {
878                    line_len = linepos;
879                }
880            }
881            b'\r' => {
882                linepos = 0;
883            }
884            0x0C => {
885                // Form feed: acts as line terminator
886                if line_len > max_len {
887                    max_len = line_len;
888                }
889                linepos = 0;
890                line_len = 0;
891            }
892            _ => {
893                if PRINTABLE_TABLE[b as usize] != 0 {
894                    linepos += 1;
895                    if linepos > line_len {
896                        line_len = linepos;
897                    }
898                }
899                // Non-printable: width 0
900            }
901        }
902    }
903
904    // Handle last line (may not end with \n)
905    if line_len > max_len {
906        max_len = line_len;
907    }
908
909    max_len
910}
911
/// Compute maximum display width of any line (UTF-8 locale).
///
/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width:
/// - `\n` and form feed terminate a line; `\r` rewinds the cursor;
///   `\t` advances to the next multiple-of-8 tab stop
/// - printable ASCII is width 1; ASCII controls are width 0
/// - zero-width codepoints (combining marks, format controls) are width 0
/// - East Asian Wide/Fullwidth codepoints are width 2; other printable
///   codepoints are width 1
///
/// NOTE(review): an invalid lead byte >= 0xA0 decodes via `decode_utf8`'s
/// (byte, 1) fallback and is then counted as width 1 here, whereas
/// wcwidth() would reject it — confirm against GNU on malformed input.
pub fn max_line_length_utf8(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0;  // widest completed line so far
    let mut line_len: u64 = 0; // rightmost column reached on current line
    let mut linepos: u64 = 0;  // current cursor column
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        // Fast path for common ASCII
        if b < 0x80 {
            match b {
                b'\n' => {
                    // Line terminator: record and reset
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                b'\t' => {
                    // Advance to the next 8-column tab stop
                    linepos = (linepos + 8) & !7;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                b'\r' => {
                    // Carriage return: rewind cursor, keep the line
                    linepos = 0;
                }
                0x0C => {
                    // Form feed: line terminator
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                0x20..=0x7E => {
                    // Printable ASCII: width 1
                    linepos += 1;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                _ => {
                    // Non-printable ASCII control chars: width 0
                }
            }
            i += 1;
        } else {
            // Multibyte UTF-8 (or an invalid byte, via the decode fallback)
            let (cp, len) = decode_utf8(&data[i..]);

            // C1 control characters (0x80..0x9F): non-printable, width 0
            if cp <= 0x9F {
                // width 0
            } else if is_zero_width(cp) {
                // Combining marks, zero-width chars: width 0
            } else if is_wide_char(cp) {
                // East Asian Wide/Fullwidth: width 2
                linepos += 2;
                if linepos > line_len {
                    line_len = linepos;
                }
            } else {
                // Regular printable Unicode character: width 1
                linepos += 1;
                if linepos > line_len {
                    line_len = linepos;
                }
            }
            i += len;
        }
    }

    // Handle last line (may not end with \n)
    if line_len > max_len {
        max_len = line_len;
    }

    max_len
}
996
997/// Compute maximum display width, choosing behavior based on locale.
998#[inline]
999pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1000    if utf8 {
1001        max_line_length_utf8(data)
1002    } else {
1003        max_line_length_c(data)
1004    }
1005}
1006
1007/// Count all metrics using optimized individual passes.
1008///
1009/// Each metric uses its own optimized algorithm:
1010/// - Lines: SIMD-accelerated memchr
1011/// - Words: 3-state scalar/state-machine (locale-dependent)
1012/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
1013/// - Max line length: locale-aware display width tracking
1014///
1015/// Multi-pass is faster than single-pass because each pass has a tight,
1016/// specialized loop. After the first pass, data is hot in L2/L3 cache,
1017/// making subsequent passes nearly free for memory bandwidth.
1018pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1019    if utf8 {
1020        let (lines, words) = count_lines_words_utf8_fused(data);
1021        WcCounts {
1022            lines,
1023            words,
1024            bytes: data.len() as u64,
1025            chars: count_chars_utf8(data),
1026            max_line_length: max_line_length_utf8(data),
1027        }
1028    } else {
1029        WcCounts {
1030            lines: count_lines(data),
1031            words: count_words_locale(data, false),
1032            bytes: data.len() as u64,
1033            chars: data.len() as u64,
1034            max_line_length: max_line_length_c(data),
1035        }
1036    }
1037}
1038
1039// ──────────────────────────────────────────────────
1040// Parallel counting for large files
1041// ──────────────────────────────────────────────────
1042
1043/// Count newlines in parallel using SIMD memchr + rayon.
1044pub fn count_lines_parallel(data: &[u8]) -> u64 {
1045    if data.len() < PARALLEL_THRESHOLD {
1046        return count_lines(data);
1047    }
1048
1049    let num_threads = rayon::current_num_threads().max(1);
1050    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1051
1052    data.par_chunks(chunk_size)
1053        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1054        .sum()
1055}
1056
1057/// Count words in parallel with boundary adjustment.
1058pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1059    if utf8 || data.len() < PARALLEL_THRESHOLD {
1060        // UTF-8: state machine can't be trivially parallelized
1061        // (multi-byte sequences may span chunk boundaries).
1062        return count_words_locale(data, utf8);
1063    }
1064
1065    // C locale: parallel 3-state word counting with boundary adjustment
1066    let num_threads = rayon::current_num_threads().max(1);
1067    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1068
1069    let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1070
1071    // Each chunk returns (word_count, first_active_is_printable, ends_in_word)
1072    let results: Vec<(u64, bool, bool)> = chunks
1073        .par_iter()
1074        .map(|chunk| count_words_c_chunk(chunk))
1075        .collect();
1076
1077    let mut total = 0u64;
1078    for i in 0..results.len() {
1079        total += results[i].0;
1080        // Boundary adjustment: if previous chunk ended in_word AND
1081        // current chunk's first non-transparent byte is printable,
1082        // the word was split across chunks — subtract the overcount.
1083        if i > 0 && results[i - 1].2 && results[i].1 {
1084            total -= 1;
1085        }
1086    }
1087    total
1088}
1089
1090/// Count UTF-8 characters in parallel.
1091pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1092    if !utf8 {
1093        return data.len() as u64;
1094    }
1095    if data.len() < PARALLEL_THRESHOLD {
1096        return count_chars_utf8(data);
1097    }
1098
1099    let num_threads = rayon::current_num_threads().max(1);
1100    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1101
1102    data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1103}
1104
1105/// Count lines + words + bytes in a single fused pass (the default wc mode).
1106/// Avoids separate passes entirely — combines newline counting with word detection.
1107pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1108    let (lines, words) = count_lines_words(data, utf8);
1109    (lines, words, data.len() as u64)
1110}
1111
1112/// Parallel counting of lines + words + bytes only (no chars).
1113/// Optimized for the default `wc` mode: avoids unnecessary char-counting pass.
1114pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1115    if data.len() < PARALLEL_THRESHOLD {
1116        // Small file: use fused single-pass
1117        return count_lwb(data, utf8);
1118    }
1119
1120    // Word counting must be sequential for UTF-8 (state machine across chunks)
1121    // But we use the fused lines+words approach to avoid a separate memchr pass
1122    let (lines, words) = if utf8 {
1123        count_lines_words_utf8_fused(data)
1124    } else {
1125        // C locale: parallel 3-state word counting with boundary adjustment
1126        let num_threads = rayon::current_num_threads().max(1);
1127        let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1128
1129        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1130        let results: Vec<(u64, bool, bool)> = chunks
1131            .par_iter()
1132            .map(|chunk| count_words_c_chunk(chunk))
1133            .collect();
1134
1135        let mut word_total = 0u64;
1136        for i in 0..results.len() {
1137            word_total += results[i].0;
1138            if i > 0 && results[i - 1].2 && results[i].1 {
1139                word_total -= 1;
1140            }
1141        }
1142
1143        let line_total: u64 = data
1144            .par_chunks(chunk_size)
1145            .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1146            .sum();
1147
1148        (line_total, word_total)
1149    };
1150
1151    (lines, words, data.len() as u64)
1152}
1153
1154/// Combined parallel counting of lines + words + chars.
1155pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1156    if data.len() < PARALLEL_THRESHOLD {
1157        let lines = count_lines(data);
1158        let words = count_words_locale(data, utf8);
1159        let chars = count_chars(data, utf8);
1160        return (lines, words, chars);
1161    }
1162
1163    // Word counting: sequential for UTF-8 (state machine), parallel for C locale
1164    let words = count_words_parallel(data, utf8);
1165
1166    // Lines and chars can always be parallelized safely
1167    let num_threads = rayon::current_num_threads().max(1);
1168    let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1169
1170    let lines: u64 = data
1171        .par_chunks(chunk_size)
1172        .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1173        .sum();
1174
1175    let chars = if utf8 {
1176        data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1177    } else {
1178        data.len() as u64
1179    };
1180
1181    (lines, words, chars)
1182}