// coreutils_rs/wc/core.rs
1use memchr::memchr_iter;
2use rayon::prelude::*;
3
4/// Minimum data size to use parallel processing (2MB).
/// A lower threshold lets multiple worker threads contribute even on moderately sized files.
6const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
7
/// Results from counting a byte slice.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of newline bytes (`\n`) — GNU wc counts bytes, not logical lines.
    pub lines: u64,
    /// Word count under the locale's 3-state classification (see module comments).
    pub words: u64,
    /// Total byte count of the input.
    pub bytes: u64,
    /// Character count: equals `bytes` in the C locale, decoded characters in UTF-8.
    pub chars: u64,
    /// Maximum display width of any line (`wc -L` semantics: tabs, wide chars).
    pub max_line_length: u64,
}
17
18// ──────────────────────────────────────────────────
19// 3-state byte classification for word counting
20// ──────────────────────────────────────────────────
21//
22// GNU wc uses mbrtowc() + iswspace() + iswprint() with 3-state logic:
23// 0 = printable (word content): starts or continues a word
24// 1 = space (word break): ends any current word
25// 2 = transparent (unchanged): non-printable, non-space — does NOT change in_word
26//
27// The critical difference from 2-state is that transparent characters
28// (NUL, control chars, invalid UTF-8) do NOT break words.
29// Example: "hello\x00world" is 1 word (NUL is transparent), not 2.
30
/// Build the 3-state byte classification table for the C/POSIX locale.
///
/// In the C locale, mbrtowc() fails for bytes >= 0x80, making them
/// transparent (class 2). The six ASCII whitespace bytes are class 1,
/// and printable ASCII (0x21-0x7E) is word content (class 0).
const fn make_byte_class_c() -> [u8; 256] {
    // Everything starts transparent.
    let mut table = [2u8; 256];
    // ASCII whitespace recognized by iswspace(): \t \n \v \f \r and space.
    let spaces = [0x09u8, 0x0A, 0x0B, 0x0C, 0x0D, 0x20];
    let mut s = 0;
    while s < spaces.len() {
        table[spaces[s] as usize] = 1;
        s += 1;
    }
    // Printable ASCII: '!' (0x21) through '~' (0x7E).
    let mut b = 0x21usize;
    while b <= 0x7E {
        table[b] = 0;
        b += 1;
    }
    table
}
51
52const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
53
/// Build the 3-state single-byte classification table for a UTF-8 locale.
/// Multi-byte UTF-8 sequences are handled by the state machine separately,
/// so the single-byte table only classifies the ASCII range; all other
/// bytes (lead/continuation bytes) are transparent here.
const fn make_byte_class_utf8() -> [u8; 256] {
    let mut table = [2u8; 256];
    let mut b = 0usize;
    while b < 256 {
        table[b] = match b {
            // ASCII whitespace: \t \n \v \f \r and space.
            0x09 | 0x0A | 0x0B | 0x0C | 0x0D | 0x20 => 1,
            // Printable ASCII: word content.
            0x21..=0x7E => 0,
            // Everything else: transparent.
            _ => 2,
        };
        b += 1;
    }
    table
}
73
74const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
75
/// Build the printable-ASCII lookup table: 0x20 (space) through 0x7E (~)
/// map to 1, everything else to 0. Used for `wc -L` column accounting.
const fn make_printable_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut b = 0usize;
    while b < 256 {
        table[b] = matches!(b, 0x20..=0x7E) as u8;
        b += 1;
    }
    table
}
86
87const PRINTABLE_TABLE: [u8; 256] = make_printable_table();
88
89// ──────────────────────────────────────────────────
90// Unicode character classification helpers
91// ──────────────────────────────────────────────────
92
/// Check if a Unicode codepoint is a whitespace character (matching glibc iswspace).
/// Only covers multi-byte Unicode spaces; ASCII spaces are handled by the byte table.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        0x00A0 => true,          // No-Break Space
        0x1680 => true,          // Ogham Space Mark
        0x2000..=0x200A => true, // En Quad through Hair Space
        0x2028 | 0x2029 => true, // Line / Paragraph Separator
        0x202F => true,          // Narrow No-Break Space
        0x205F => true,          // Medium Mathematical Space
        0x3000 => true,          // Ideographic Space
        _ => false,
    }
}
110
/// Check if a Unicode codepoint (>= 0x80) is printable (matching glibc iswprint).
/// C1 control characters (U+0080-U+009F) are not printable; everything from
/// U+00A0 upward is treated as printable.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    matches!(cp, 0xA0..)
}
118
119// ──────────────────────────────────────────────────
120// Core counting functions
121// ──────────────────────────────────────────────────
122
123/// Count newlines using SIMD-accelerated memchr.
124/// GNU wc counts newline bytes (`\n`), not logical lines.
125#[inline]
126pub fn count_lines(data: &[u8]) -> u64 {
127 memchr_iter(b'\n', data).count() as u64
128}
129
/// Count bytes. Trivial but included for API consistency with the other
/// `count_*` functions.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let total = data.len();
    total as u64
}
135
136/// Count words using locale-aware 3-state logic (default: UTF-8).
137pub fn count_words(data: &[u8]) -> u64 {
138 count_words_locale(data, true)
139}
140
141/// Count words with explicit locale control using 3-state logic.
142///
143/// GNU wc classifies each character as:
144/// - space (iswspace=true): sets in_word=false
145/// - printable (iswprint=true): sets in_word=true, increments word count on transition
146/// - transparent (neither): leaves in_word unchanged
147pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
148 if utf8 {
149 count_words_utf8(data)
150 } else {
151 count_words_c(data)
152 }
153}
154
/// Count words in C/POSIX locale using 3-state scalar logic.
/// Only printable ASCII (0x21-0x7E) forms words; the six ASCII whitespace
/// bytes break words; bytes >= 0x80 and non-printable ASCII controls are
/// transparent (they neither start nor end a word).
fn count_words_c(data: &[u8]) -> u64 {
    let mut words = 0u64;
    let mut in_word = false;
    for &byte in data {
        match byte {
            // Whitespace ends any current word.
            0x09..=0x0D | 0x20 => in_word = false,
            // Printable ASCII starts a word unless we are already in one.
            0x21..=0x7E => {
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // Transparent: in_word is left untouched.
            _ => {}
        }
    }
    words
}
177
/// Count words in a C locale chunk, returning word count plus boundary info.
/// Used by parallel word counting.
/// Returns (word_count, first_active_is_printable, ends_in_word).
///
/// "Active" means a non-transparent byte (class 0 or 1). The chunk is scanned
/// assuming in_word = false at entry; the caller uses the boundary flags to
/// undo words that were double-counted across a chunk split.
///
/// NOTE(review): a chunk containing only transparent bytes returns
/// ends_in_word = false even though transparent bytes do not break a word;
/// a caller that chains chunk states must carry the previous state through
/// such chunks — verify at the call site.
fn count_words_c_chunk(data: &[u8]) -> (u64, bool, bool) {
    let mut words = 0u64;
    let mut in_word = false;
    let mut first_active_is_printable = false;
    let mut seen_active = false;

    for &b in data {
        let class = BYTE_CLASS_C[b as usize];
        if class == 1 {
            // Space byte: if it is the chunk's first active byte, the
            // first-active flag stays false (a space cannot continue a word).
            if !seen_active {
                seen_active = true;
                // first_active_is_printable stays false
            }
            in_word = false;
        } else if class == 0 {
            // Printable byte: may both mark the first active byte and
            // open a new word.
            if !seen_active {
                seen_active = true;
                first_active_is_printable = true;
            }
            if !in_word {
                in_word = true;
                words += 1;
            }
        }
        // class == 2: transparent — in_word unchanged
    }
    (words, first_active_is_printable, in_word)
}
208
/// Count words in UTF-8 locale using a state machine with 3-state logic.
///
/// Handles:
/// - ASCII spaces (0x09-0x0D, 0x20): word break
/// - ASCII printable (0x21-0x7E): word content
/// - ASCII non-printable (0x00-0x08, 0x0E-0x1F, 0x7F): transparent
/// - Valid UTF-8 multi-byte → check Unicode space/printable
/// - Invalid UTF-8: transparent (GNU wc skips invalid bytes without changing state)
///
/// NOTE(review): sequence validity is checked only via continuation-byte
/// masks, so overlong 3-/4-byte forms and UTF-16 surrogate ranges
/// (0xED 0xA0..) still decode to a codepoint here; surrogates land in the
/// "printable" branch whereas mbrtowc() would reject them — confirm against
/// GNU wc on such input.
fn count_words_utf8(data: &[u8]) -> u64 {
    let mut words = 0u64;
    let mut in_word = false;
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        if b < 0x80 {
            // ASCII: use 3-state lookup table
            let class = BYTE_CLASS_UTF8[b as usize];
            if class == 1 {
                in_word = false;
            } else if class == 0 {
                if !in_word {
                    in_word = true;
                    words += 1;
                }
            }
            // class == 2: transparent
            i += 1;
        } else if b < 0xC2 {
            // 0x80-0xBF: standalone continuation byte (invalid UTF-8)
            // 0xC0-0xC1: overlong encoding (invalid UTF-8)
            // Transparent: don't change in_word
            i += 1;
        } else if b < 0xE0 {
            // 2-byte sequence: need 1 continuation byte
            if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                // else: non-printable (e.g., C1 controls U+0080-U+009F) → transparent
                i += 2;
            } else {
                // Invalid sequence: transparent. Only the lead byte is
                // consumed, so the following byte is re-examined on its own.
                i += 1;
            }
        } else if b < 0xF0 {
            // 3-byte sequence: need 2 continuation bytes
            if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
                let cp = ((b as u32 & 0x0F) << 12)
                    | ((data[i + 1] as u32 & 0x3F) << 6)
                    | (data[i + 2] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 3;
            } else {
                // Invalid: transparent
                i += 1;
            }
        } else if b < 0xF5 {
            // 4-byte sequence: need 3 continuation bytes
            if i + 3 < data.len()
                && (data[i + 1] & 0xC0) == 0x80
                && (data[i + 2] & 0xC0) == 0x80
                && (data[i + 3] & 0xC0) == 0x80
            {
                let cp = ((b as u32 & 0x07) << 18)
                    | ((data[i + 1] as u32 & 0x3F) << 12)
                    | ((data[i + 2] as u32 & 0x3F) << 6)
                    | (data[i + 3] as u32 & 0x3F);
                if is_unicode_space(cp) {
                    in_word = false;
                } else if is_unicode_printable(cp) {
                    if !in_word {
                        in_word = true;
                        words += 1;
                    }
                }
                i += 4;
            } else {
                // Invalid: transparent
                i += 1;
            }
        } else {
            // 0xF5-0xFF: invalid UTF-8 — transparent
            i += 1;
        }
    }

    words
}
312
313/// Count lines and words using optimized strategies per locale.
314/// UTF-8: separate SIMD memchr + state machine passes.
315/// C locale: single scalar pass with 3-state logic.
316pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
317 if utf8 {
318 let lines = count_lines(data);
319 let words = count_words_utf8(data);
320 (lines, words)
321 } else {
322 let mut lines = 0u64;
323 let mut words = 0u64;
324 let mut in_word = false;
325 for &b in data {
326 if b == b'\n' {
327 lines += 1;
328 }
329 let class = BYTE_CLASS_C[b as usize];
330 if class == 1 {
331 in_word = false;
332 } else if class == 0 {
333 if !in_word {
334 in_word = true;
335 words += 1;
336 }
337 }
338 }
339 (lines, words)
340 }
341}
342
343/// Count lines, words, and chars using optimized strategies per locale.
344pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
345 if utf8 {
346 // Three separate optimized passes (data stays cache-hot between passes)
347 let lines = count_lines(data);
348 let words = count_words_utf8(data);
349 let chars = count_chars_utf8(data);
350 (lines, words, chars)
351 } else {
352 // C locale: single pass for lines + words, chars = byte count
353 let mut lines = 0u64;
354 let mut words = 0u64;
355 let mut in_word = false;
356 for &b in data {
357 if b == b'\n' {
358 lines += 1;
359 }
360 let class = BYTE_CLASS_C[b as usize];
361 if class == 1 {
362 in_word = false;
363 } else if class == 0 {
364 if !in_word {
365 in_word = true;
366 words += 1;
367 }
368 }
369 }
370 (lines, words, data.len() as u64)
371 }
372}
373
/// Count UTF-8 characters by counting non-continuation bytes.
/// A continuation byte has the bit pattern `10xxxxxx` (0x80..0xBF).
/// Every other byte starts a new character (ASCII, multi-byte leader, or invalid).
///
/// Processes 64-byte blocks by building a bitmask of character-start bytes
/// and popcounting it; the iterator form replaces the previous manual 8-way
/// unroll (identical results, and `chunks_exact` + `enumerate` lets LLVM
/// drop bounds checks and vectorize the loop).
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    let chunks = data.chunks_exact(64);
    let remainder = chunks.remainder();

    let mut count = 0u64;
    for chunk in chunks {
        // Bit `i` is set iff chunk[i] is NOT a continuation byte.
        let mut char_mask = 0u64;
        for (bit, &b) in chunk.iter().enumerate() {
            char_mask |= (((b & 0xC0) != 0x80) as u64) << bit;
        }
        count += char_mask.count_ones() as u64;
    }

    // Scalar tail for the final partial block.
    count
        + remainder
            .iter()
            .filter(|&&b| (b & 0xC0) != 0x80)
            .count() as u64
}
407
/// Count characters in C/POSIX locale. Single-byte encoding means every
/// byte is exactly one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_count = data.len();
    byte_count as u64
}
413
414/// Count characters, choosing behavior based on locale.
415#[inline]
416pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
417 if utf8 {
418 count_chars_utf8(data)
419 } else {
420 count_chars_c(data)
421 }
422}
423
/// Detect if the current locale uses UTF-8 encoding.
///
/// Checks LC_ALL, then LC_CTYPE, then LANG (the glibc precedence order);
/// the first set-and-non-empty variable decides. Empty or unset variables
/// fall through to the next; if none match, assumes a non-UTF-8 locale.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|var| std::env::var(var).ok())
        .find(|val| !val.is_empty())
        .map(|val| {
            let lower = val.to_ascii_lowercase();
            lower.contains("utf-8") || lower.contains("utf8")
        })
        .unwrap_or(false)
}
436
/// Decode one UTF-8 character from a (non-empty) byte slice.
/// Returns (codepoint, byte_length). On invalid or truncated UTF-8,
/// returns (lead byte as u32, 1) so the caller advances one byte.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    // True iff bytes[idx] exists and is a continuation byte (10xxxxxx).
    let cont = |idx: usize| matches!(bytes.get(idx), Some(&b) if b & 0xC0 == 0x80);
    let lead = bytes[0];
    match lead {
        // ASCII fast path.
        0x00..=0x7F => (lead as u32, 1),
        // 2-byte sequence (0x80..=0xC1 are invalid starts; they fall to `_`).
        0xC2..=0xDF if cont(1) => {
            let cp = ((lead as u32 & 0x1F) << 6) | (bytes[1] as u32 & 0x3F);
            (cp, 2)
        }
        // 3-byte sequence.
        0xE0..=0xEF if cont(1) && cont(2) => {
            let cp = ((lead as u32 & 0x0F) << 12)
                | ((bytes[1] as u32 & 0x3F) << 6)
                | (bytes[2] as u32 & 0x3F);
            (cp, 3)
        }
        // 4-byte sequence (0xF5..=0xFF are invalid starts; they fall to `_`).
        0xF0..=0xF4 if cont(1) && cont(2) && cont(3) => {
            let cp = ((lead as u32 & 0x07) << 18)
                | ((bytes[1] as u32 & 0x3F) << 12)
                | ((bytes[2] as u32 & 0x3F) << 6)
                | (bytes[3] as u32 & 0x3F);
            (cp, 4)
        }
        // Invalid start byte, missing continuation, or truncated input.
        _ => (lead as u32, 1),
    }
}
480
/// Check if a Unicode codepoint is an East Asian Wide/Fullwidth character (display width 2).
///
/// NOTE(review): this is a hand-maintained approximation of the ranges that
/// wcwidth() reports as width 2 (UAX #11 Wide/Fullwidth plus common wide
/// emoji blocks); it is not generated from Unicode data tables, so spot-check
/// against the target platform's wcwidth for edge ranges.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F // Hangul Jamo
        | 0x231A..=0x231B // Watch, Hourglass
        | 0x2329..=0x232A // Angle Brackets
        | 0x23E9..=0x23F3 // Various symbols
        | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E // CJK Radicals, Kangxi Radicals, Ideographic Description
        | 0x3041..=0x33BF // Hiragana, Katakana, Bopomofo, Hangul Compat Jamo, Kanbun, CJK
        | 0x3400..=0x4DBF // CJK Unified Ideographs Extension A
        | 0x4E00..=0xA4CF // CJK Unified Ideographs, Yi
        | 0xA960..=0xA97C // Hangul Jamo Extended-A
        | 0xAC00..=0xD7A3 // Hangul Syllables
        | 0xF900..=0xFAFF // CJK Compatibility Ideographs
        | 0xFE10..=0xFE19 // Vertical Forms
        | 0xFE30..=0xFE6F // CJK Compatibility Forms
        | 0xFF01..=0xFF60 // Fullwidth Forms (halfwidth katakana at U+FF61+ excluded)
        | 0xFFE0..=0xFFE6 // Fullwidth Signs
        | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF // Regional Indicators
        | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F // Misc Symbols, Emoticons
        | 0x1F680..=0x1F6FF // Transport Symbols
        | 0x1F900..=0x1F9FF // Supplemental Symbols
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD // CJK Unified Ideographs Extension B-F
        | 0x30000..=0x3FFFD // CJK Unified Ideographs Extension G
    )
}
566
/// Compute maximum display width of any line (C/POSIX locale).
///
/// GNU wc -L behavior in C locale:
/// - `\n`: line terminator (records max, resets position)
/// - `\t`: advances to next tab stop (multiple of 8)
/// - `\r`: carriage return (resets position to 0, same line)
/// - `\f`: form feed (acts as line terminator like \n)
/// - Printable ASCII (0x20..0x7E): width 1
/// - Everything else (controls, high bytes): width 0
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0; // widest line seen so far
    let mut line_len: u64 = 0; // widest position reached on the current line
    let mut linepos: u64 = 0; // current cursor column

    for &byte in data {
        match byte {
            // \n and \f both terminate the line: fold its width into the max.
            b'\n' | 0x0C => {
                max_len = max_len.max(line_len);
                line_len = 0;
                linepos = 0;
            }
            // Tab jumps the cursor to the next multiple of 8.
            b'\t' => {
                linepos = (linepos | 7) + 1;
                line_len = line_len.max(linepos);
            }
            // Carriage return rewinds the cursor but the line keeps its width.
            b'\r' => linepos = 0,
            // Printable ASCII occupies one column.
            0x20..=0x7E => {
                linepos += 1;
                line_len = line_len.max(linepos);
            }
            // Other controls and high bytes have zero width.
            _ => {}
        }
    }

    // The final line may be unterminated — account for it too.
    max_len.max(line_len)
}
626
/// Compute maximum display width of any line (UTF-8 locale).
///
/// GNU wc -L in UTF-8 locale uses mbrtowc() + wcwidth() for display width.
/// East Asian Wide/Fullwidth characters get width 2, most others get width 1.
///
/// NOTE(review): a stray continuation byte in 0xA0..=0xBF is returned by
/// decode_utf8() as its own value, which is > 0x9F here and therefore gets
/// width 1; combining marks also get width 1 (wcwidth reports 0) — confirm
/// both against GNU wc on such input.
pub fn max_line_length_utf8(data: &[u8]) -> u64 {
    let mut max_len: u64 = 0; // widest line seen so far
    let mut line_len: u64 = 0; // widest position reached on the current line
    let mut linepos: u64 = 0; // current cursor column
    let mut i = 0;

    while i < data.len() {
        let b = data[i];

        // Fast path for common ASCII
        if b < 0x80 {
            match b {
                b'\n' => {
                    // Line terminator: record the line's width, reset.
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                b'\t' => {
                    // Advance to the next tab stop (multiple of 8).
                    linepos = (linepos + 8) & !7;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                b'\r' => {
                    // Carriage return rewinds the cursor; width already seen
                    // on this line is retained in line_len.
                    linepos = 0;
                }
                0x0C => {
                    // Form feed: line terminator
                    if line_len > max_len {
                        max_len = line_len;
                    }
                    linepos = 0;
                    line_len = 0;
                }
                0x20..=0x7E => {
                    // Printable ASCII
                    linepos += 1;
                    if linepos > line_len {
                        line_len = linepos;
                    }
                }
                _ => {
                    // Non-printable ASCII control chars: width 0
                }
            }
            i += 1;
        } else {
            // Multibyte UTF-8
            let (cp, len) = decode_utf8(&data[i..]);

            // C1 control characters (0x80..0x9F): non-printable
            if cp <= 0x9F {
                // width 0
            } else if is_wide_char(cp) {
                // East Asian Wide/Fullwidth: two columns.
                linepos += 2;
                if linepos > line_len {
                    line_len = linepos;
                }
            } else {
                // Regular printable Unicode character: width 1
                linepos += 1;
                if linepos > line_len {
                    line_len = linepos;
                }
            }
            i += len;
        }
    }

    // Handle last line (may not end with \n)
    if line_len > max_len {
        max_len = line_len;
    }

    max_len
}
709
710/// Compute maximum display width, choosing behavior based on locale.
711#[inline]
712pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
713 if utf8 {
714 max_line_length_utf8(data)
715 } else {
716 max_line_length_c(data)
717 }
718}
719
720/// Count all metrics using optimized individual passes.
721///
722/// Each metric uses its own optimized algorithm:
723/// - Lines: SIMD-accelerated memchr
724/// - Words: 3-state scalar/state-machine (locale-dependent)
725/// - Chars: non-continuation byte counting (UTF-8) or byte counting (C locale)
726/// - Max line length: locale-aware display width tracking
727///
728/// Multi-pass is faster than single-pass because each pass has a tight,
729/// specialized loop. After the first pass, data is hot in L2/L3 cache,
730/// making subsequent passes nearly free for memory bandwidth.
731pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
732 WcCounts {
733 lines: count_lines(data),
734 words: count_words_locale(data, utf8),
735 bytes: data.len() as u64,
736 chars: count_chars(data, utf8),
737 max_line_length: max_line_length(data, utf8),
738 }
739}
740
741// ──────────────────────────────────────────────────
742// Parallel counting for large files
743// ──────────────────────────────────────────────────
744
745/// Count newlines in parallel using SIMD memchr + rayon.
746pub fn count_lines_parallel(data: &[u8]) -> u64 {
747 if data.len() < PARALLEL_THRESHOLD {
748 return count_lines(data);
749 }
750
751 let num_threads = rayon::current_num_threads().max(1);
752 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
753
754 data.par_chunks(chunk_size)
755 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
756 .sum()
757}
758
759/// Count words in parallel with boundary adjustment.
760pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
761 if utf8 || data.len() < PARALLEL_THRESHOLD {
762 // UTF-8: state machine can't be trivially parallelized
763 // (multi-byte sequences may span chunk boundaries).
764 return count_words_locale(data, utf8);
765 }
766
767 // C locale: parallel 3-state word counting with boundary adjustment
768 let num_threads = rayon::current_num_threads().max(1);
769 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
770
771 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
772
773 // Each chunk returns (word_count, first_active_is_printable, ends_in_word)
774 let results: Vec<(u64, bool, bool)> = chunks
775 .par_iter()
776 .map(|chunk| count_words_c_chunk(chunk))
777 .collect();
778
779 let mut total = 0u64;
780 for i in 0..results.len() {
781 total += results[i].0;
782 // Boundary adjustment: if previous chunk ended in_word AND
783 // current chunk's first non-transparent byte is printable,
784 // the word was split across chunks — subtract the overcount.
785 if i > 0 && results[i - 1].2 && results[i].1 {
786 total -= 1;
787 }
788 }
789 total
790}
791
792/// Count UTF-8 characters in parallel.
793pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
794 if !utf8 {
795 return data.len() as u64;
796 }
797 if data.len() < PARALLEL_THRESHOLD {
798 return count_chars_utf8(data);
799 }
800
801 let num_threads = rayon::current_num_threads().max(1);
802 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
803
804 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
805}
806
807/// Combined parallel counting of lines + words + chars.
808pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
809 if data.len() < PARALLEL_THRESHOLD {
810 let lines = count_lines(data);
811 let words = count_words_locale(data, utf8);
812 let chars = count_chars(data, utf8);
813 return (lines, words, chars);
814 }
815
816 // Word counting: sequential for UTF-8 (state machine), parallel for C locale
817 let words = count_words_parallel(data, utf8);
818
819 // Lines and chars can always be parallelized safely
820 let num_threads = rayon::current_num_threads().max(1);
821 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
822
823 let lines: u64 = data
824 .par_chunks(chunk_size)
825 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
826 .sum();
827
828 let chars = if utf8 {
829 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
830 } else {
831 data.len() as u64
832 };
833
834 (lines, words, chars)
835}