1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Minimum input size (1 MiB) for the `*_parallel` functions in this file to
/// split work across rayon; shorter inputs are counted on the calling thread.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
8
/// Totals gathered for one input buffer (filled in by `count_all`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of `\n` bytes in the input.
    pub lines: u64,
    /// Number of words under the selected (UTF-8 or C) rules.
    pub words: u64,
    /// Input length in bytes.
    pub bytes: u64,
    /// Character count; `count_all` sets this to the byte count outside UTF-8 mode.
    pub chars: u64,
    /// Widest line, as computed by the `max_line_length_*` functions.
    pub max_line_length: u64,
}
18
/// Builds, at compile time, a 256-entry table that is `true` exactly for the
/// six ASCII whitespace bytes: tab, line feed, vertical tab, form feed,
/// carriage return and space.
const fn make_is_space() -> [bool; 256] {
    const WHITESPACE: [u8; 6] = [0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x20];

    let mut table = [false; 256];
    let mut k = 0;
    // `for` is not available in `const fn`, hence the manual index loop.
    while k < WHITESPACE.len() {
        table[WHITESPACE[k] as usize] = true;
        k += 1;
    }
    table
}
/// Byte-indexed lookup table produced by `make_is_space`: `true` for ASCII whitespace.
const IS_SPACE: [bool; 256] = make_is_space();
56
/// Builds, at compile time, a 256-entry table that is `true` for the bytes
/// 0x20 through 0x7E inclusive (space included) and `false` elsewhere.
const fn make_is_print() -> [bool; 256] {
    let mut table = [false; 256];
    let mut byte: usize = 0x20;
    // Exclusive upper bound 0x7F keeps DEL out of the printable set.
    while byte < 0x7F {
        table[byte] = true;
        byte += 1;
    }
    table
}
/// Byte-indexed lookup table produced by `make_is_print`: `true` for 0x20..=0x7E.
const IS_PRINT: [bool; 256] = make_is_print();
71
72#[inline]
76pub(crate) fn first_is_word(data: &[u8]) -> bool {
77 !data.is_empty() && IS_PRINT[data[0] as usize] && !IS_SPACE[data[0] as usize]
78}
79
/// Returns `true` for the non-ASCII code points this module treats as
/// whitespace: U+1680, U+2000..=U+200A, U+2028, U+2029, U+205F and U+3000.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        0x2000..=0x200A => true,
        0x1680 | 0x2028 | 0x2029 | 0x205F | 0x3000 => true,
        _ => false,
    }
}
99
/// Returns `true` for U+00A0, U+2007, U+202F and U+2060, the extra code
/// points `is_unicode_word_break` treats as word separators.
#[inline]
fn is_wnbspace(cp: u32) -> bool {
    const NON_BREAKING: [u32; 4] = [0x00A0, 0x2007, 0x202F, 0x2060];
    NON_BREAKING.contains(&cp)
}
107
108#[inline]
110fn is_unicode_word_break(cp: u32) -> bool {
111 is_unicode_space(cp) || is_wnbspace(cp)
112}
113
/// Decides whether a decoded non-ASCII code point can start or continue a word.
///
/// Rejected: everything below U+00A0, the surrogate range, values above
/// U+10FFFF, U+FDD0..=U+FDEF, and any code point whose low 16 bits are
/// 0xFFFE or 0xFFFF. Everything else is accepted.
#[inline]
fn is_printable_unicode(cp: u32) -> bool {
    let below_latin1_printables = cp < 0xA0;
    let surrogate_or_too_large = matches!(cp, 0xD800..=0xDFFF) || cp > 0x10FFFF;
    // Masking with 0xFFFE folds ...FFFE and ...FFFF of every plane together.
    let noncharacter = matches!(cp, 0xFDD0..=0xFDEF) || cp & 0xFFFE == 0xFFFE;

    !(below_latin1_printables || surrogate_or_too_large || noncharacter)
}
141
/// Counts the `\n` bytes in `data` via `memchr_iter`.
#[inline]
pub fn count_lines(data: &[u8]) -> u64 {
    memchr_iter(b'\n', data).count() as u64
}
152
/// Returns the length of `data` in bytes.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    data.len() as u64
}
158
/// Counts words in `data` using the UTF-8 rules (shorthand for
/// `count_words_locale(data, true)`).
pub fn count_words(data: &[u8]) -> u64 {
    count_words_locale(data, true)
}
163
164pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
169 if utf8 {
170 count_words_utf8(data)
171 } else {
172 count_words_c(data)
173 }
174}
175
176fn count_words_c(data: &[u8]) -> u64 {
182 let mut words = 0u64;
183 let mut in_word = false;
184 let mut i = 0;
185 let len = data.len();
186
187 while i < len {
188 let b = unsafe { *data.get_unchecked(i) };
189 if IS_SPACE[b as usize] {
190 in_word = false;
191 } else if IS_PRINT[b as usize] {
192 if !in_word {
193 in_word = true;
194 words += 1;
195 }
196 }
197 i += 1;
199 }
200 words
201}
202
203#[cfg(target_arch = "x86_64")]
207#[inline(always)]
208fn count_lw_c_scalar_tail(
209 ptr: *const u8,
210 mut i: usize,
211 len: usize,
212 mut total_lines: u64,
213 mut total_words: u64,
214 mut prev_in_word: bool,
215 data: &[u8],
216) -> (u64, u64, bool, bool) {
217 while i < len {
220 let b = unsafe { *ptr.add(i) };
221 if IS_SPACE[b as usize] {
222 if b == b'\n' {
223 total_lines += 1;
224 }
225 prev_in_word = false;
226 } else if IS_PRINT[b as usize] && !prev_in_word {
227 total_words += 1;
228 prev_in_word = true;
229 }
230 i += 1;
232 }
233 let first_word = first_is_word(data);
234 (total_lines, total_words, first_word, prev_in_word)
235}
236
/// AVX2 line/word counter for the byte-oriented (C-locale) rules.
///
/// Processes `data` in 32-byte blocks and hands the remainder to
/// `count_lw_c_scalar_tail`. Returns
/// `(lines, words, first_is_word(data), in_word_at_end)`.
///
/// # Safety
///
/// The CPU must support AVX2; `count_lw_c_chunk_fast` checks this at runtime
/// before calling.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    let len = data.len();
    let ptr = data.as_ptr();
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    unsafe {
        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
        let zero = _mm256_setzero_si256();
        let ones = _mm256_set1_epi8(1);
        let const_0x09 = _mm256_set1_epi8(0x09u8 as i8);
        let const_0x0d = _mm256_set1_epi8(0x0Du8 as i8);
        let const_0x20 = _mm256_set1_epi8(0x20u8 as i8);
        let const_0x21 = _mm256_set1_epi8(0x21u8 as i8);
        let const_0x7e = _mm256_set1_epi8(0x7Eu8 as i8);

        // Per-lane u8 newline counters; flushed every 255 blocks (see below).
        let mut line_acc = _mm256_setzero_si256();
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            // Newlines are counted here for every block, including blocks
            // that take the scalar word fallback further down.
            let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
            line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));

            // Unsigned range test: max(v, lo) == v means v >= lo,
            // min(v, hi) == v means v <= hi.
            let ge_09 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x09), v);
            let le_0d = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x0d), v);
            let in_tab_range = _mm256_and_si256(ge_09, le_0d);
            let is_sp = _mm256_cmpeq_epi8(v, const_0x20);
            let is_space = _mm256_or_si256(in_tab_range, is_sp);
            let space_mask = _mm256_movemask_epi8(is_space) as u32;

            // Word bytes: 0x21..=0x7E.
            let ge_21 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x21), v);
            let le_7e = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x7e), v);
            let is_print = _mm256_and_si256(ge_21, le_7e);
            let print_mask = _mm256_movemask_epi8(is_print) as u32;

            // Bytes that are neither whitespace nor word bytes keep the
            // in-word state unchanged, which the bitmask trick cannot express.
            let transparent_mask = !(space_mask | print_mask);
            if transparent_mask == 0 {
                // Bit k of `prev_space` is set when byte k-1 is whitespace;
                // bit 0 comes from the state carried in from the previous block.
                let prev_space = (space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 };
                // A word starts at every word byte preceded by whitespace.
                let starts = print_mask & prev_space;
                total_words += starts.count_ones() as u64;
                prev_in_word = (print_mask >> 31) & 1 == 1;
            } else {
                // Scalar fallback for this block; updates word state only.
                let end = (i + 32).min(len);
                for j in i..end {
                    let b = *ptr.add(j);
                    if IS_SPACE[b as usize] {
                        prev_in_word = false;
                    } else if IS_PRINT[b as usize] && !prev_in_word {
                        total_words += 1;
                        prev_in_word = true;
                    }
                }
            }

            batch += 1;
            // Each lane grows by at most 1 per block, so flush before a u8
            // lane could wrap.
            if batch >= 255 {
                // Sum the 32 byte lanes: SAD against zero gives four 64-bit
                // partial sums, which are then folded into one scalar.
                let sad = _mm256_sad_epu8(line_acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let s = _mm_add_epi64(lo, hi);
                let h64 = _mm_unpackhi_epi64(s, s);
                let t = _mm_add_epi64(s, h64);
                total_lines += _mm_cvtsi128_si64(t) as u64;
                line_acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Flush whatever is still sitting in the lane counters.
        if batch > 0 {
            let sad = _mm256_sad_epu8(line_acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let s = _mm_add_epi64(lo, hi);
            let h64 = _mm_unpackhi_epi64(s, s);
            let t = _mm_add_epi64(s, h64);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }
    }

    // Fewer than 32 bytes remain; finish them one byte at a time.
    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
}
340
/// SSE2 line/word counter for the byte-oriented (C-locale) rules.
///
/// Same algorithm as `count_lw_c_chunk_avx2` with 16-byte blocks. Returns
/// `(lines, words, first_is_word(data), in_word_at_end)`.
///
/// # Safety
///
/// The CPU must support SSE2. `count_lw_c_chunk_fast` calls this without a
/// runtime check (SSE2 is part of the x86_64 baseline).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    let len = data.len();
    let ptr = data.as_ptr();
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    unsafe {
        let nl_byte = _mm_set1_epi8(b'\n' as i8);
        let zero = _mm_setzero_si128();
        let ones = _mm_set1_epi8(1);
        let const_0x09 = _mm_set1_epi8(0x09u8 as i8);
        let const_0x0d = _mm_set1_epi8(0x0Du8 as i8);
        let const_0x20 = _mm_set1_epi8(0x20u8 as i8);
        let const_0x21 = _mm_set1_epi8(0x21u8 as i8);
        let const_0x7e = _mm_set1_epi8(0x7Eu8 as i8);

        // Per-lane u8 newline counters; flushed every 255 blocks (see below).
        let mut line_acc = _mm_setzero_si128();
        let mut batch = 0u32;

        while i + 16 <= len {
            let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);
            // Newlines are counted here for every block, including blocks
            // that take the scalar word fallback further down.
            let is_nl = _mm_cmpeq_epi8(v, nl_byte);
            line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));

            // Unsigned range test: max(v, lo) == v means v >= lo,
            // min(v, hi) == v means v <= hi.
            let ge_09 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x09), v);
            let le_0d = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x0d), v);
            let in_tab_range = _mm_and_si128(ge_09, le_0d);
            let is_sp = _mm_cmpeq_epi8(v, const_0x20);
            let is_space = _mm_or_si128(in_tab_range, is_sp);
            let space_mask = (_mm_movemask_epi8(is_space) as u32) & 0xFFFF;

            // Word bytes: 0x21..=0x7E.
            let ge_21 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x21), v);
            let le_7e = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x7e), v);
            let is_print = _mm_and_si128(ge_21, le_7e);
            let print_mask = (_mm_movemask_epi8(is_print) as u32) & 0xFFFF;

            // Only the low 16 bits are meaningful; bytes that are neither
            // whitespace nor word bytes force the scalar path.
            let transparent_mask = !(space_mask | print_mask) & 0xFFFF;
            if transparent_mask == 0 {
                // Bit k of `prev_space` is set when byte k-1 is whitespace;
                // bit 0 comes from the state carried in from the previous block.
                let prev_space =
                    ((space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 }) & 0xFFFF;
                // A word starts at every word byte preceded by whitespace.
                let starts = print_mask & prev_space;
                total_words += starts.count_ones() as u64;
                prev_in_word = (print_mask >> 15) & 1 == 1;
            } else {
                // Scalar fallback for this block; updates word state only.
                let end = (i + 16).min(len);
                for j in i..end {
                    let b = *ptr.add(j);
                    if IS_SPACE[b as usize] {
                        prev_in_word = false;
                    } else if IS_PRINT[b as usize] && !prev_in_word {
                        total_words += 1;
                        prev_in_word = true;
                    }
                }
            }

            batch += 1;
            // Each lane grows by at most 1 per block, so flush before a u8
            // lane could wrap.
            if batch >= 255 {
                // SAD against zero yields two 64-bit partial sums; add them.
                let sad = _mm_sad_epu8(line_acc, zero);
                let hi = _mm_unpackhi_epi64(sad, sad);
                let t = _mm_add_epi64(sad, hi);
                total_lines += _mm_cvtsi128_si64(t) as u64;
                line_acc = _mm_setzero_si128();
                batch = 0;
            }
            i += 16;
        }

        // Flush whatever is still sitting in the lane counters.
        if batch > 0 {
            let sad = _mm_sad_epu8(line_acc, zero);
            let hi = _mm_unpackhi_epi64(sad, sad);
            let t = _mm_add_epi64(sad, hi);
            total_lines += _mm_cvtsi128_si64(t) as u64;
        }
    }

    // Fewer than 16 bytes remain; finish them one byte at a time.
    count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
}
429
430#[inline]
432fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
433 #[cfg(target_arch = "x86_64")]
434 {
435 if is_x86_feature_detected!("avx2") && data.len() >= 64 {
436 return unsafe { count_lw_c_chunk_avx2(data) };
437 }
438 if data.len() >= 32 {
439 return unsafe { count_lw_c_chunk_sse2(data) };
440 }
441 }
442 count_lw_c_chunk(data)
443}
444
445fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
449 let mut lines = 0u64;
450 let mut words = 0u64;
451 let mut in_word = false;
452 let mut i = 0;
453 let len = data.len();
454
455 let first_word = first_is_word(data);
456
457 while i < len {
460 let b = unsafe { *data.get_unchecked(i) };
461 if IS_SPACE[b as usize] {
462 if b == b'\n' {
463 lines += 1;
464 }
465 in_word = false;
466 } else if IS_PRINT[b as usize] {
467 if !in_word {
468 in_word = true;
469 words += 1;
470 }
471 }
472 i += 1;
474 }
475 (lines, words, first_word, in_word)
476}
477
478fn count_words_utf8(data: &[u8]) -> u64 {
490 let mut words = 0u64;
491 let mut in_word = false;
492 let mut i = 0;
493 let len = data.len();
494
495 while i < len {
496 let b = unsafe { *data.get_unchecked(i) };
497
498 if b < 0x80 {
499 if IS_SPACE[b as usize] {
504 in_word = false;
505 } else if b >= 0x21 && b <= 0x7E {
506 if !in_word {
508 in_word = true;
509 words += 1;
510 }
511 }
512 i += 1;
514 } else if b < 0xC2 {
515 i += 1;
518 } else if b < 0xE0 {
519 if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
520 let cp = ((b as u32 & 0x1F) << 6)
521 | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
522 if is_unicode_word_break(cp) {
523 in_word = false;
524 } else if is_printable_unicode(cp) {
525 if !in_word {
526 in_word = true;
527 words += 1;
528 }
529 }
530 i += 2;
532 } else {
533 i += 1;
535 }
536 } else if b < 0xF0 {
537 if i + 2 < len
538 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
539 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
540 {
541 let cp = ((b as u32 & 0x0F) << 12)
542 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
543 | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
544 if is_unicode_word_break(cp) {
545 in_word = false;
546 } else if is_printable_unicode(cp) {
547 if !in_word {
548 in_word = true;
549 words += 1;
550 }
551 }
552 i += 3;
554 } else {
555 i += 1;
557 }
558 } else if b < 0xF5 {
559 if i + 3 < len
560 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
561 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
562 && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
563 {
564 let cp = ((b as u32 & 0x07) << 18)
565 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
566 | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
567 | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
568 if is_unicode_word_break(cp) {
569 in_word = false;
570 } else if is_printable_unicode(cp) {
571 if !in_word {
572 in_word = true;
573 words += 1;
574 }
575 }
576 i += 4;
578 } else {
579 i += 1;
581 }
582 } else {
583 i += 1;
585 }
586 }
587
588 words
589}
590
591pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
595 if utf8 {
596 count_lines_words_utf8_fused(data)
597 } else {
598 let (lines, words, _, _) = count_lw_c_chunk_fast(data);
599 (lines, words)
600 }
601}
602
603fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
611 let mut lines = 0u64;
612 let mut words = 0u64;
613 let mut in_word = false;
614 let mut i = 0;
615 let len = data.len();
616
617 while i < len {
618 let b = unsafe { *data.get_unchecked(i) };
619
620 if b == b'\n' {
621 lines += 1;
622 in_word = false;
623 i += 1;
624 } else if b < 0x80 {
625 if IS_SPACE[b as usize] {
629 in_word = false;
630 } else if b >= 0x21 && b <= 0x7E {
631 if !in_word {
632 in_word = true;
633 words += 1;
634 }
635 }
636 i += 1;
638 } else if b < 0xC2 {
639 i += 1;
641 } else if b < 0xE0 {
642 if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
643 let cp = ((b as u32 & 0x1F) << 6)
644 | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
645 if is_unicode_word_break(cp) {
646 in_word = false;
647 } else if is_printable_unicode(cp) {
648 if !in_word {
649 in_word = true;
650 words += 1;
651 }
652 }
653 i += 2;
654 } else {
655 i += 1;
657 }
658 } else if b < 0xF0 {
659 if i + 2 < len
660 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
661 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
662 {
663 let cp = ((b as u32 & 0x0F) << 12)
664 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
665 | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
666 if is_unicode_word_break(cp) {
667 in_word = false;
668 } else if is_printable_unicode(cp) {
669 if !in_word {
670 in_word = true;
671 words += 1;
672 }
673 }
674 i += 3;
675 } else {
676 i += 1;
678 }
679 } else if b < 0xF5 {
680 if i + 3 < len
681 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
682 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
683 && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
684 {
685 let cp = ((b as u32 & 0x07) << 18)
686 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
687 | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
688 | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
689 if is_unicode_word_break(cp) {
690 in_word = false;
691 } else if is_printable_unicode(cp) {
692 if !in_word {
693 in_word = true;
694 words += 1;
695 }
696 }
697 i += 4;
698 } else {
699 i += 1;
701 }
702 } else {
703 i += 1;
705 }
706 }
707
708 (lines, words)
709}
710
711pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
713 if utf8 {
714 let (lines, words) = count_lines_words_utf8_fused(data);
716 let chars = count_chars_utf8(data);
717 (lines, words, chars)
718 } else {
719 let (lines, words) = count_lines_words(data, false);
721 (lines, words, data.len() as u64)
722 }
723}
724
/// Counts characters in `data` as the number of bytes that are not UTF-8
/// continuation bytes (`10xxxxxx`).
///
/// Uses the AVX2 implementation on x86_64 when the CPU supports it, and the
/// scalar implementation otherwise.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 availability was confirmed by the check above.
            return unsafe { count_chars_utf8_avx2(data) };
        }
    }
    count_chars_utf8_scalar(data)
}
740
/// AVX2 implementation of `count_chars_utf8`: counts the bytes of `data`
/// that are not UTF-8 continuation bytes, 32 bytes per iteration.
///
/// # Safety
///
/// The CPU must support AVX2; `count_chars_utf8` checks this at runtime
/// before calling.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    unsafe {
        use std::arch::x86_64::*;

        let mask_c0 = _mm256_set1_epi8(0xC0u8 as i8);
        let val_80 = _mm256_set1_epi8(0x80u8 as i8);
        let ones = _mm256_set1_epi8(1);
        let zero = _mm256_setzero_si256();

        let mut total = 0u64;
        let len = data.len();
        let ptr = data.as_ptr();
        let mut i = 0;
        // Per-lane u8 counters of non-continuation bytes.
        let mut acc = _mm256_setzero_si256();
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            // (byte & 0xC0) == 0x80 identifies a continuation byte.
            let masked = _mm256_and_si256(v, mask_c0);
            let is_cont = _mm256_cmpeq_epi8(masked, val_80);
            // andnot keeps the 1 only in lanes that are NOT continuation bytes.
            let non_cont = _mm256_andnot_si256(is_cont, ones);
            acc = _mm256_add_epi8(acc, non_cont);

            batch += 1;
            // Each lane grows by at most 1 per block, so flush before a u8
            // lane could wrap.
            if batch >= 255 {
                // SAD against zero gives four 64-bit partial sums, which are
                // then folded into one scalar.
                let sad = _mm256_sad_epu8(acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let sum = _mm_add_epi64(lo, hi);
                let hi64 = _mm_unpackhi_epi64(sum, sum);
                let t = _mm_add_epi64(sum, hi64);
                total += _mm_cvtsi128_si64(t) as u64;
                acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Flush whatever is still sitting in the lane counters.
        if batch > 0 {
            let sad = _mm256_sad_epu8(acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let sum = _mm_add_epi64(lo, hi);
            let hi64 = _mm_unpackhi_epi64(sum, sum);
            let t = _mm_add_epi64(sum, hi64);
            total += _mm_cvtsi128_si64(t) as u64;
        }

        // Scalar tail for the final bytes that do not fill a 32-byte block.
        while i < len {
            total += ((*ptr.add(i) & 0xC0) != 0x80) as u64;
            i += 1;
        }

        total
    }
}
804
/// Portable implementation of `count_chars_utf8`: counts the bytes of `data`
/// that are not UTF-8 continuation bytes (`10xxxxxx`).
///
/// Works on 64-byte blocks. A block with no byte >= 0x80 contributes 64
/// without inspecting individual bytes again.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    // Number of bytes in `bytes` that begin a character.
    fn leading_bytes(bytes: &[u8]) -> u64 {
        bytes.iter().filter(|&&b| b & 0xC0 != 0x80).count() as u64
    }

    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    let mut total = 0u64;
    for block in blocks {
        // OR-ing every byte together exposes whether any has the high bit set.
        let combined = block.iter().fold(0u8, |acc, &b| acc | b);
        total += if combined < 0x80 {
            64
        } else {
            leading_bytes(block)
        };
    }

    total + leading_bytes(tail)
}
856
/// Character count for the byte-oriented (C-locale) mode: one character per byte.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    data.len() as u64
}
862
863#[inline]
865pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
866 if utf8 {
867 count_chars_utf8(data)
868 } else {
869 count_chars_c(data)
870 }
871}
872
/// Reports whether the process environment selects a UTF-8 locale.
///
/// Looks at `LC_ALL`, `LC_CTYPE` and `LANG` in that order. The first variable
/// that is set to a non-empty (valid Unicode) value decides: the result is
/// `true` when that value contains "utf-8" or "utf8", ignoring ASCII case.
/// Returns `false` when none of the variables qualifies.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let lowered = value.to_ascii_lowercase();
            lowered.contains("utf-8") || lowered.contains("utf8")
        })
}
885
/// Decodes one code point from the front of `bytes`.
///
/// Returns `(code_point, bytes_consumed)`. When the lead byte is ASCII the
/// byte itself is returned. When it is not a valid lead (0x80..=0xC1,
/// 0xF5..=0xFF) or the sequence is truncated or has a non-continuation byte,
/// the raw lead byte is returned with a length of 1. Overlong encodings and
/// surrogates are not rejected.
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let lead = bytes[0];
    // Fallback result: the lead byte taken at face value.
    let raw = (u32::from(lead), 1usize);

    // Payload of the continuation byte at `idx`, or `None` when that byte is
    // missing or is not of the form 10xxxxxx.
    let cont = |idx: usize| -> Option<u32> {
        match bytes.get(idx) {
            Some(&b) if b & 0xC0 == 0x80 => Some(u32::from(b & 0x3F)),
            _ => None,
        }
    };

    match lead {
        0xC2..=0xDF => match cont(1) {
            Some(c1) => ((u32::from(lead & 0x1F) << 6) | c1, 2),
            None => raw,
        },
        0xE0..=0xEF => match (cont(1), cont(2)) {
            (Some(c1), Some(c2)) => ((u32::from(lead & 0x0F) << 12) | (c1 << 6) | c2, 3),
            _ => raw,
        },
        0xF0..=0xF4 => match (cont(1), cont(2), cont(3)) {
            (Some(c1), Some(c2), Some(c3)) => (
                (u32::from(lead & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3,
                4,
            ),
            _ => raw,
        },
        _ => raw,
    }
}
929
/// Returns `true` for code points that `max_line_length_utf8` counts as
/// occupying zero display columns.
///
/// The list is a fixed set of literal ranges.
// NOTE(review): the ranges look like a hand-maintained snapshot of Unicode
// combining and format characters; confirm the Unicode version before extending.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    matches!(
        cp,
        0x0300..=0x036F | 0x0483..=0x0489 | 0x0591..=0x05BD | 0x05BF
        | 0x05C1..=0x05C2
        | 0x05C4..=0x05C5
        | 0x05C7
        | 0x0600..=0x0605 | 0x0610..=0x061A | 0x064B..=0x065F | 0x0670
        | 0x06D6..=0x06DD
        | 0x06DF..=0x06E4
        | 0x06E7..=0x06E8
        | 0x06EA..=0x06ED
        | 0x070F
        | 0x0711
        | 0x0730..=0x074A
        | 0x07A6..=0x07B0
        | 0x07EB..=0x07F3
        | 0x07FD
        | 0x0816..=0x0819
        | 0x081B..=0x0823
        | 0x0825..=0x0827
        | 0x0829..=0x082D
        | 0x0859..=0x085B
        | 0x08D3..=0x08E1
        | 0x08E3..=0x0902
        | 0x093A
        | 0x093C
        | 0x0941..=0x0948
        | 0x094D
        | 0x0951..=0x0957
        | 0x0962..=0x0963
        | 0x0981
        | 0x09BC
        | 0x09C1..=0x09C4
        | 0x09CD
        | 0x09E2..=0x09E3
        | 0x09FE
        | 0x0A01..=0x0A02
        | 0x0A3C
        | 0x0A41..=0x0A42
        | 0x0A47..=0x0A48
        | 0x0A4B..=0x0A4D
        | 0x0A51
        | 0x0A70..=0x0A71
        | 0x0A75
        | 0x0A81..=0x0A82
        | 0x0ABC
        | 0x0AC1..=0x0AC5
        | 0x0AC7..=0x0AC8
        | 0x0ACD
        | 0x0AE2..=0x0AE3
        | 0x0AFA..=0x0AFF
        | 0x0B01
        | 0x0B3C
        | 0x0B3F
        | 0x0B41..=0x0B44
        | 0x0B4D
        | 0x0B56
        | 0x0B62..=0x0B63
        | 0x0B82
        | 0x0BC0
        | 0x0BCD
        | 0x0C00
        | 0x0C04
        | 0x0C3E..=0x0C40
        | 0x0C46..=0x0C48
        | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56
        | 0x0C62..=0x0C63
        | 0x0C81
        | 0x0CBC
        | 0x0CBF
        | 0x0CC6
        | 0x0CCC..=0x0CCD
        | 0x0CE2..=0x0CE3
        | 0x0D00..=0x0D01
        | 0x0D3B..=0x0D3C
        | 0x0D41..=0x0D44
        | 0x0D4D
        | 0x0D62..=0x0D63
        | 0x0DCA
        | 0x0DD2..=0x0DD4
        | 0x0DD6
        | 0x0E31
        | 0x0E34..=0x0E3A
        | 0x0E47..=0x0E4E
        | 0x0EB1
        | 0x0EB4..=0x0EBC
        | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19
        | 0x0F35
        | 0x0F37
        | 0x0F39
        | 0x0F71..=0x0F7E
        | 0x0F80..=0x0F84
        | 0x0F86..=0x0F87
        | 0x0F8D..=0x0F97
        | 0x0F99..=0x0FBC
        | 0x0FC6
        | 0x102D..=0x1030
        | 0x1032..=0x1037
        | 0x1039..=0x103A
        | 0x103D..=0x103E
        | 0x1058..=0x1059
        | 0x105E..=0x1060
        | 0x1071..=0x1074
        | 0x1082
        | 0x1085..=0x1086
        | 0x108D
        | 0x109D
        | 0x1160..=0x11FF | 0x135D..=0x135F
        | 0x1712..=0x1714
        | 0x1732..=0x1734
        | 0x1752..=0x1753
        | 0x1772..=0x1773
        | 0x17B4..=0x17B5
        | 0x17B7..=0x17BD
        | 0x17C6
        | 0x17C9..=0x17D3
        | 0x17DD
        | 0x180B..=0x180D
        | 0x1885..=0x1886
        | 0x18A9
        | 0x1920..=0x1922
        | 0x1927..=0x1928
        | 0x1932
        | 0x1939..=0x193B
        | 0x1A17..=0x1A18
        | 0x1A1B
        | 0x1A56
        | 0x1A58..=0x1A5E
        | 0x1A60
        | 0x1A62
        | 0x1A65..=0x1A6C
        | 0x1A73..=0x1A7C
        | 0x1A7F
        | 0x1AB0..=0x1ABE
        | 0x1B00..=0x1B03
        | 0x1B34
        | 0x1B36..=0x1B3A
        | 0x1B3C
        | 0x1B42
        | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81
        | 0x1BA2..=0x1BA5
        | 0x1BA8..=0x1BA9
        | 0x1BAB..=0x1BAD
        | 0x1BE6
        | 0x1BE8..=0x1BE9
        | 0x1BED
        | 0x1BEF..=0x1BF1
        | 0x1C2C..=0x1C33
        | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2
        | 0x1CD4..=0x1CE0
        | 0x1CE2..=0x1CE8
        | 0x1CED
        | 0x1CF4
        | 0x1CF8..=0x1CF9
        | 0x1DC0..=0x1DF9
        | 0x1DFB..=0x1DFF
        | 0x200B..=0x200F | 0x202A..=0x202E | 0x2060..=0x2064 | 0x2066..=0x206F | 0x20D0..=0x20F0 | 0xFE00..=0xFE0F | 0xFE20..=0xFE2F | 0xFEFF | 0xFFF9..=0xFFFB | 0x1D167..=0x1D169
        | 0x1D173..=0x1D182
        | 0x1D185..=0x1D18B
        | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001
        | 0xE0020..=0xE007F
        | 0xE0100..=0xE01EF )
}
1120
/// Returns `true` for code points that `max_line_length_utf8` counts as
/// occupying two display columns.
///
/// The list is a fixed set of literal ranges.
// NOTE(review): the ranges look like a snapshot of East Asian wide/fullwidth
// and emoji code points; confirm the Unicode version before extending.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F | 0x231A..=0x231B | 0x2329..=0x232A | 0x23E9..=0x23F3 | 0x23F8..=0x23FA
        | 0x25FD..=0x25FE
        | 0x2614..=0x2615
        | 0x2648..=0x2653
        | 0x267F
        | 0x2693
        | 0x26A1
        | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE
        | 0x26C4..=0x26C5
        | 0x26CE
        | 0x26D4
        | 0x26EA
        | 0x26F2..=0x26F3
        | 0x26F5
        | 0x26FA
        | 0x26FD
        | 0x2702
        | 0x2705
        | 0x2708..=0x270D
        | 0x270F
        | 0x2712
        | 0x2714
        | 0x2716
        | 0x271D
        | 0x2721
        | 0x2728
        | 0x2733..=0x2734
        | 0x2744
        | 0x2747
        | 0x274C
        | 0x274E
        | 0x2753..=0x2755
        | 0x2757
        | 0x2763..=0x2764
        | 0x2795..=0x2797
        | 0x27A1
        | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935
        | 0x2B05..=0x2B07
        | 0x2B1B..=0x2B1C
        | 0x2B50
        | 0x2B55
        | 0x2E80..=0x303E | 0x3040..=0x33BF | 0x3400..=0x4DBF | 0x4E00..=0xA4CF | 0xA960..=0xA97C | 0xAC00..=0xD7A3 | 0xF900..=0xFAFF | 0xFE10..=0xFE19 | 0xFE30..=0xFE6F | 0xFF01..=0xFF60 | 0xFFE0..=0xFFE6 | 0x1F004
        | 0x1F0CF
        | 0x1F170..=0x1F171
        | 0x1F17E..=0x1F17F
        | 0x1F18E
        | 0x1F191..=0x1F19A
        | 0x1F1E0..=0x1F1FF | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248
        | 0x1F250..=0x1F251
        | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F | 0x1F680..=0x1F6FF | 0x1F900..=0x1F9FF | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD | 0x30000..=0x3FFFD )
}
1207
/// Computes the widest line of `data` using byte-oriented (C-locale) rules.
///
/// * Bytes 0x20..=0x7E advance the column by one.
/// * Tab advances to the next multiple of eight.
/// * Carriage return moves the column back to zero without ending the line.
/// * Line feed and form feed end the line.
/// * Every other byte (controls, bytes >= 0x80) has no width.
///
/// A final line without a terminator is included.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut best = 0u64; // widest finished line so far
    let mut widest = 0u64; // furthest column reached on the current line
    let mut column = 0u64; // current cursor column

    for &byte in data {
        match byte {
            b' '..=b'~' => {
                column += 1;
                widest = widest.max(column);
            }
            b'\n' | 0x0C => {
                best = best.max(widest);
                column = 0;
                widest = 0;
            }
            b'\t' => {
                // Round up to the next tab stop (multiple of 8).
                column = (column + 8) & !7;
                widest = widest.max(column);
            }
            // The cursor returns to column 0 but the line keeps its extent.
            b'\r' => column = 0,
            _ => {}
        }
    }

    best.max(widest)
}
1289
1290pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1297 let mut max_len: u64 = 0;
1298 let mut line_len: u64 = 0;
1299 let mut linepos: u64 = 0;
1300 let mut i = 0;
1301 let len = data.len();
1302
1303 while i < len {
1304 let b = unsafe { *data.get_unchecked(i) };
1305
1306 if b >= 0x21 && b <= 0x7E {
1307 i += 1;
1309 let mut run = 1u64;
1310 while i < len {
1311 let b = unsafe { *data.get_unchecked(i) };
1312 if b >= 0x21 && b <= 0x7E {
1313 run += 1;
1314 i += 1;
1315 } else {
1316 break;
1317 }
1318 }
1319 linepos += run;
1320 if linepos > line_len {
1321 line_len = linepos;
1322 }
1323 } else if b < 0x80 {
1324 match b {
1326 b' ' => {
1327 linepos += 1;
1328 if linepos > line_len {
1329 line_len = linepos;
1330 }
1331 }
1332 b'\n' => {
1333 if line_len > max_len {
1334 max_len = line_len;
1335 }
1336 linepos = 0;
1337 line_len = 0;
1338 }
1339 b'\t' => {
1340 linepos = (linepos + 8) & !7;
1341 if linepos > line_len {
1342 line_len = linepos;
1343 }
1344 }
1345 b'\r' => {
1346 linepos = 0;
1347 }
1348 0x0C => {
1349 if line_len > max_len {
1350 max_len = line_len;
1351 }
1352 linepos = 0;
1353 line_len = 0;
1354 }
1355 _ => {} }
1357 i += 1;
1358 } else {
1359 let (cp, blen) = decode_utf8(&data[i..]);
1361
1362 if cp <= 0x9F {
1364 } else if is_zero_width(cp) {
1366 } else if is_wide_char(cp) {
1368 linepos += 2;
1369 if linepos > line_len {
1370 line_len = linepos;
1371 }
1372 } else {
1373 linepos += 1;
1375 if linepos > line_len {
1376 line_len = linepos;
1377 }
1378 }
1379 i += blen;
1380 }
1381 }
1382
1383 if line_len > max_len {
1385 max_len = line_len;
1386 }
1387
1388 max_len
1389}
1390
1391#[inline]
1393pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1394 if utf8 {
1395 max_line_length_utf8(data)
1396 } else {
1397 max_line_length_c(data)
1398 }
1399}
1400
1401pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1413 if utf8 {
1414 let (lines, words) = count_lines_words_utf8_fused(data);
1415 WcCounts {
1416 lines,
1417 words,
1418 bytes: data.len() as u64,
1419 chars: count_chars_utf8(data),
1420 max_line_length: max_line_length_utf8(data),
1421 }
1422 } else {
1423 WcCounts {
1424 lines: count_lines(data),
1425 words: count_words_locale(data, false),
1426 bytes: data.len() as u64,
1427 chars: data.len() as u64,
1428 max_line_length: max_line_length_c(data),
1429 }
1430 }
1431}
1432
/// Heuristically checks whether `data` is ASCII by sampling it.
///
/// Up to three windows of at most 256 bytes are inspected: the head, a window
/// centred on the midpoint (only when the input is more than two windows
/// long), and the tail. Returns `false` as soon as a sampled byte is >= 0x80.
/// Non-ASCII bytes outside the windows are not detected. Empty input is
/// reported as ASCII.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    let total = data.len();
    if total == 0 {
        return true;
    }

    let window = total.min(256);

    if !data[..window].is_ascii() {
        return false;
    }

    if total > 2 * window {
        let start = (total / 2).saturating_sub(window / 2);
        let stop = (start + window).min(total);
        if !data[start..stop].is_ascii() {
            return false;
        }
    }

    // When the whole input fits in one window, the head check covered it.
    total <= window || data[total - window..].is_ascii()
}
1491
1492fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1501 if data.is_empty() || num_chunks <= 1 {
1502 return vec![data];
1503 }
1504 let chunk_size = data.len() / num_chunks;
1505 let mut chunks = Vec::with_capacity(num_chunks);
1506 let mut pos = 0;
1507
1508 for _ in 0..num_chunks - 1 {
1509 let target = pos + chunk_size;
1510 if target >= data.len() {
1511 break;
1512 }
1513 let boundary = memchr::memchr(b'\n', &data[target..])
1514 .map(|p| target + p + 1)
1515 .unwrap_or(data.len());
1516 if boundary > pos {
1517 chunks.push(&data[pos..boundary]);
1518 }
1519 pos = boundary;
1520 }
1521 if pos < data.len() {
1522 chunks.push(&data[pos..]);
1523 }
1524 chunks
1525}
1526
1527pub fn count_lines_parallel(data: &[u8]) -> u64 {
1530 if data.len() < PARALLEL_THRESHOLD {
1531 return count_lines(data);
1532 }
1533
1534 let num_threads = rayon::current_num_threads().max(1);
1535 let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1537
1538 data.par_chunks(chunk_size)
1539 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1540 .sum()
1541}
1542
1543pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1545 if data.len() < PARALLEL_THRESHOLD {
1546 return count_words_locale(data, utf8);
1547 }
1548
1549 let num_threads = rayon::current_num_threads().max(1);
1550
1551 if utf8 {
1552 let chunks = split_at_newlines(data, num_threads);
1555 chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1556 } else {
1557 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1559
1560 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1561
1562 let results: Vec<(u64, u64, bool, bool)> = chunks
1564 .par_iter()
1565 .map(|chunk| count_lw_c_chunk(chunk))
1566 .collect();
1567
1568 let mut total = 0u64;
1569 for i in 0..results.len() {
1570 total += results[i].1;
1571 if i > 0 && results[i - 1].3 && results[i].2 {
1575 total -= 1;
1576 }
1577 }
1578 total
1579 }
1580}
1581
1582pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1584 if !utf8 {
1585 return data.len() as u64;
1586 }
1587 if data.len() < PARALLEL_THRESHOLD {
1588 return count_chars_utf8(data);
1589 }
1590
1591 let num_threads = rayon::current_num_threads().max(1);
1592 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1593
1594 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1595}
1596
1597pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1600 let (lines, words) = count_lines_words(data, utf8);
1601 (lines, words, data.len() as u64)
1602}
1603
1604pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1610 if data.len() < PARALLEL_THRESHOLD {
1611 return count_lwb(data, utf8);
1613 }
1614
1615 let num_threads = rayon::current_num_threads().max(1);
1616
1617 let (lines, words) = if !utf8 {
1618 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1620
1621 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1622 let results: Vec<(u64, u64, bool, bool)> = chunks
1623 .par_iter()
1624 .map(|chunk| count_lw_c_chunk_fast(chunk))
1625 .collect();
1626
1627 let mut line_total = 0u64;
1628 let mut word_total = 0u64;
1629 for i in 0..results.len() {
1630 line_total += results[i].0;
1631 word_total += results[i].1;
1632 if i > 0 && results[i - 1].3 && results[i].2 {
1633 word_total -= 1;
1634 }
1635 }
1636
1637 (line_total, word_total)
1638 } else {
1639 let is_ascii = check_ascii_sample(data);
1641 if is_ascii {
1642 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1644 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1645 let results: Vec<(u64, u64, bool, bool)> = chunks
1646 .par_iter()
1647 .map(|chunk| count_lw_c_chunk_fast(chunk))
1648 .collect();
1649
1650 let mut line_total = 0u64;
1651 let mut word_total = 0u64;
1652 for i in 0..results.len() {
1653 line_total += results[i].0;
1654 word_total += results[i].1;
1655 if i > 0 && results[i - 1].3 && results[i].2 {
1656 word_total -= 1;
1657 }
1658 }
1659 (line_total, word_total)
1660 } else {
1661 let chunks = split_at_newlines(data, num_threads);
1664 let results: Vec<(u64, u64)> = chunks
1665 .par_iter()
1666 .map(|chunk| count_lines_words_utf8_fused(chunk))
1667 .collect();
1668 let mut line_total = 0u64;
1669 let mut word_total = 0u64;
1670 for (l, w) in results {
1671 line_total += l;
1672 word_total += w;
1673 }
1674 (line_total, word_total)
1675 }
1676 };
1677
1678 (lines, words, data.len() as u64)
1679}
1680
1681pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1685 if data.len() < PARALLEL_THRESHOLD {
1686 let lines = count_lines(data);
1687 let words = count_words_locale(data, utf8);
1688 let chars = count_chars(data, utf8);
1689 return (lines, words, chars);
1690 }
1691
1692 let num_threads = rayon::current_num_threads().max(1);
1693
1694 if utf8 {
1695 let chunks = split_at_newlines(data, num_threads);
1697 let results: Vec<(u64, u64, u64)> = chunks
1698 .par_iter()
1699 .map(|chunk| {
1700 let (lines, words) = count_lines_words_utf8_fused(chunk);
1701 let chars = count_chars_utf8(chunk);
1702 (lines, words, chars)
1703 })
1704 .collect();
1705 let mut lines = 0u64;
1706 let mut words = 0u64;
1707 let mut chars = 0u64;
1708 for (l, w, c) in results {
1709 lines += l;
1710 words += w;
1711 chars += c;
1712 }
1713 (lines, words, chars)
1714 } else {
1715 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1717 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1718 let results: Vec<(u64, u64, bool, bool)> = chunks
1719 .par_iter()
1720 .map(|chunk| count_lw_c_chunk_fast(chunk))
1721 .collect();
1722 let mut lines = 0u64;
1723 let mut words = 0u64;
1724 for i in 0..results.len() {
1725 lines += results[i].0;
1726 words += results[i].1;
1727 if i > 0 && results[i - 1].3 && results[i].2 {
1728 words -= 1;
1729 }
1730 }
1731 (lines, words, data.len() as u64)
1732 }
1733}
1734
1735pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1739 if data.len() < PARALLEL_THRESHOLD {
1740 return max_line_length(data, utf8);
1741 }
1742 let num_threads = rayon::current_num_threads().max(1);
1743 let chunks = split_at_newlines(data, num_threads);
1744 chunks
1745 .par_iter()
1746 .map(|chunk| {
1747 if utf8 {
1748 max_line_length_utf8(chunk)
1749 } else {
1750 max_line_length_c(chunk)
1751 }
1752 })
1753 .max()
1754 .unwrap_or(0)
1755}
1756
1757pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1761 if data.len() < PARALLEL_THRESHOLD {
1762 return count_all(data, utf8);
1763 }
1764
1765 let num_threads = rayon::current_num_threads().max(1);
1766 let chunks = split_at_newlines(data, num_threads);
1767
1768 if utf8 {
1769 let results: Vec<(u64, u64, u64, u64)> = chunks
1770 .par_iter()
1771 .map(|chunk| {
1772 let (lines, words) = count_lines_words_utf8_fused(chunk);
1773 let chars = count_chars_utf8(chunk);
1774 let max_ll = max_line_length_utf8(chunk);
1775 (lines, words, chars, max_ll)
1776 })
1777 .collect();
1778
1779 let mut counts = WcCounts {
1780 bytes: data.len() as u64,
1781 ..Default::default()
1782 };
1783 for (l, w, c, m) in results {
1784 counts.lines += l;
1785 counts.words += w;
1786 counts.chars += c;
1787 if m > counts.max_line_length {
1788 counts.max_line_length = m;
1789 }
1790 }
1791 counts
1792 } else {
1793 let results: Vec<(u64, u64, u64)> = chunks
1795 .par_iter()
1796 .map(|chunk| {
1797 let (lines, words) = count_lines_words(chunk, false);
1798 let max_ll = max_line_length_c(chunk);
1799 (lines, words, max_ll)
1800 })
1801 .collect();
1802
1803 let mut counts = WcCounts {
1804 bytes: data.len() as u64,
1805 chars: data.len() as u64,
1806 ..Default::default()
1807 };
1808 for (l, w, m) in &results {
1809 counts.lines += l;
1810 counts.words += w;
1811 if *m > counts.max_line_length {
1812 counts.max_line_length = *m;
1813 }
1814 }
1815 counts
1816 }
1817}