1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Input size in bytes (1 MiB) below which the `*_parallel` counters skip
/// rayon and run the single-threaded implementation instead.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
8
/// Aggregate counts for one input buffer, as produced by `count_all`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of newline (`\n`) bytes.
    pub lines: u64,
    /// Number of words under the selected locale rules.
    pub words: u64,
    /// Total number of bytes in the input.
    pub bytes: u64,
    /// Number of characters; equals `bytes` when the input is not treated as UTF-8.
    pub chars: u64,
    /// Display width of the widest line.
    pub max_line_length: u64,
}
18
/// Byte classification table used by the C-locale counters. It is a copy of
/// `BYTE_CLASS_UTF8`: 0 = printable ASCII (0x21..=0x7E), 1 = ASCII whitespace
/// (0x09..=0x0D and 0x20), 2 = every other byte.
const BYTE_CLASS_C: [u8; 256] = BYTE_CLASS_UTF8;
46
/// Builds the 256-entry byte classification table at compile time.
///
/// Each entry holds one of three classes:
/// * `0` — printable ASCII (0x21..=0x7E), i.e. word content,
/// * `1` — ASCII whitespace (0x09..=0x0D and 0x20), i.e. a word break,
/// * `2` — every other byte, which neither starts nor ends a word.
const fn make_byte_class_utf8() -> [u8; 256] {
    let mut table = [2u8; 256];
    let mut byte = 0usize;
    // `for` loops are not usable in `const fn`, hence the manual counter.
    while byte < 256 {
        table[byte] = match byte as u8 {
            0x09..=0x0D | 0x20 => 1,
            0x21..=0x7E => 0,
            _ => 2,
        };
        byte += 1;
    }
    table
}
66
/// Byte classification table built at compile time by `make_byte_class_utf8`;
/// indexed by byte value when classifying single-byte (ASCII) input.
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
68
/// Returns `true` when `cp` is one of the non-ASCII code points this module
/// treats as a word separator: U+00A0, U+1680, U+2000..=U+200A, U+2028,
/// U+2029, U+202F, U+205F and U+3000.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        0x2000..=0x200A => true,
        0x00A0 | 0x1680 | 0x2028 | 0x2029 | 0x202F | 0x205F | 0x3000 => true,
        _ => false,
    }
}
90
/// Returns `true` for code points at or above U+00A0, i.e. everything past
/// the C1 control range. Callers test `is_unicode_space` first, so spaces in
/// that range never reach this check.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    const LAST_C1_CONTROL: u32 = 0x9F;
    cp > LAST_C1_CONTROL
}
98
99#[inline]
106pub fn count_lines(data: &[u8]) -> u64 {
107 memchr_iter(b'\n', data).count() as u64
108}
109
/// Returns the length of `data` in bytes.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let byte_len: usize = data.len();
    byte_len as u64
}
115
116pub fn count_words(data: &[u8]) -> u64 {
118 count_words_locale(data, true)
119}
120
121pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
128 if utf8 {
129 count_words_utf8(data)
130 } else {
131 count_words_c(data)
132 }
133}
134
135fn count_words_c(data: &[u8]) -> u64 {
145 let mut words = 0u64;
146 let mut in_word = false;
147 let mut i = 0;
148 let len = data.len();
149
150 while i < len {
151 let b = unsafe { *data.get_unchecked(i) };
152 if b >= 0x21 && b <= 0x7E {
153 if !in_word {
155 in_word = true;
156 words += 1;
157 }
158 i += 1;
159 while i < len {
161 let b = unsafe { *data.get_unchecked(i) };
162 if b >= 0x21 && b <= 0x7E {
163 i += 1;
164 } else {
165 break;
166 }
167 }
168 } else {
169 let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
170 if class == 1 {
171 in_word = false;
173 }
174 i += 1;
176 }
177 }
178 words
179}
180
/// AVX2 line + word counter for one chunk under the C-locale rules.
///
/// Bytes fall into three classes: printable ASCII (0x21..=0x7E) is word
/// content, ASCII whitespace (0x09..=0x0D, 0x20) ends a word, and every other
/// byte is "transparent" (it neither starts nor ends a word).
///
/// Returns `(lines, words, first_is_word, ends_in_word)`:
/// * `lines` — number of `\n` bytes,
/// * `words` — words counted as if the chunk started outside a word,
/// * `first_is_word` — whether the first *non-transparent* byte is printable.
///   A caller merging adjacent chunks uses this to detect a word that
///   continues from the previous chunk; leading transparent bytes must not
///   hide that continuation, which matches the scalar `count_lw_c_chunk`.
/// * `ends_in_word` — whether the chunk ends inside a word.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    const LANES: usize = 32;
    // Each lane of the 8-bit newline accumulator grows by at most 1 per
    // vector, so it has to be drained after at most 255 vectors.
    const MAX_VECTORS_PER_BATCH: usize = 255;

    let len = data.len();
    let ptr = data.as_ptr();
    let vector_end = len - len % LANES;
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    // SAFETY: every 32-byte load starts at `i < vector_end <= len - 31`, so it
    // stays inside `data`; AVX2 availability is the caller's obligation.
    unsafe {
        let nl_byte = _mm256_set1_epi8(b'\n' as i8);
        let zero = _mm256_setzero_si256();
        let ones = _mm256_set1_epi8(1);
        // Signed compares: bytes >= 0x80 are negative and therefore fall
        // outside both ranges, i.e. they are classified as transparent.
        let below_print = _mm256_set1_epi8(0x20i8);
        let above_print = _mm256_set1_epi8(0x7Fi8);
        let space_char = _mm256_set1_epi8(0x20i8);
        let below_ws = _mm256_set1_epi8(0x08i8);
        let above_ws = _mm256_set1_epi8(0x0Ei8);

        while i < vector_end {
            let batch_end = vector_end.min(i + MAX_VECTORS_PER_BATCH * LANES);
            let mut line_acc = zero;

            while i < batch_end {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);

                let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
                line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));

                let is_printable = _mm256_and_si256(
                    _mm256_cmpgt_epi8(v, below_print),
                    _mm256_cmpgt_epi8(above_print, v),
                );
                let is_space = _mm256_or_si256(
                    _mm256_cmpeq_epi8(v, space_char),
                    _mm256_and_si256(
                        _mm256_cmpgt_epi8(v, below_ws),
                        _mm256_cmpgt_epi8(above_ws, v),
                    ),
                );

                let print_mask = _mm256_movemask_epi8(is_printable) as u32;
                let space_mask = _mm256_movemask_epi8(is_space) as u32;

                if (print_mask | space_mask) == 0xFFFF_FFFF {
                    // No transparent bytes: a word starts wherever a printable
                    // byte follows a non-printable one.
                    let prev_mask = (print_mask << 1) | (prev_in_word as u32);
                    total_words += (print_mask & !prev_mask).count_ones() as u64;
                    prev_in_word = (print_mask >> 31) != 0;
                } else {
                    // Transparent bytes present: replay the state machine bit by bit.
                    let mut in_word = prev_in_word;
                    for bit in 0..32 {
                        if (space_mask >> bit) & 1 != 0 {
                            in_word = false;
                        } else if (print_mask >> bit) & 1 != 0 && !in_word {
                            total_words += 1;
                            in_word = true;
                        }
                    }
                    prev_in_word = in_word;
                }

                i += LANES;
            }

            // Horizontal sum of the per-lane newline counters.
            let sad = _mm256_sad_epu8(line_acc, zero);
            let halves = _mm_add_epi64(
                _mm256_castsi256_si128(sad),
                _mm256_extracti128_si256(sad, 1),
            );
            let folded = _mm_add_epi64(halves, _mm_unpackhi_epi64(halves, halves));
            total_lines += _mm_cvtsi128_si64(folded) as u64;
        }
    }

    // Scalar tail (fewer than 32 bytes).
    for &b in &data[vector_end..] {
        match b {
            b'\n' => {
                total_lines += 1;
                prev_in_word = false;
            }
            0x21..=0x7E => {
                if !prev_in_word {
                    total_words += 1;
                    prev_in_word = true;
                }
            }
            0x09..=0x0D | b' ' => prev_in_word = false,
            _ => {}
        }
    }

    // Skip leading transparent bytes: only the first whitespace-or-printable
    // byte decides whether the chunk opens in the middle of a word.
    let first_is_word = data
        .iter()
        .copied()
        .find(|&b| matches!(b, 0x09..=0x0D | 0x20..=0x7E))
        .map_or(false, |b| b >= 0x21);

    (total_lines, total_words, first_is_word, prev_in_word)
}
317
/// SSE2 line + word counter for one chunk under the C-locale rules.
///
/// Bytes fall into three classes: printable ASCII (0x21..=0x7E) is word
/// content, ASCII whitespace (0x09..=0x0D, 0x20) ends a word, and every other
/// byte is "transparent" (it neither starts nor ends a word).
///
/// Returns `(lines, words, first_is_word, ends_in_word)`:
/// * `lines` — number of `\n` bytes,
/// * `words` — words counted as if the chunk started outside a word,
/// * `first_is_word` — whether the first *non-transparent* byte is printable.
///   A caller merging adjacent chunks uses this to detect a word that
///   continues from the previous chunk; leading transparent bytes must not
///   hide that continuation, which matches the scalar `count_lw_c_chunk`.
/// * `ends_in_word` — whether the chunk ends inside a word.
///
/// # Safety
///
/// The caller must ensure the CPU supports SSE2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
    use std::arch::x86_64::*;

    const LANES: usize = 16;
    // Each lane of the 8-bit newline accumulator grows by at most 1 per
    // vector, so it has to be drained after at most 255 vectors.
    const MAX_VECTORS_PER_BATCH: usize = 255;

    let len = data.len();
    let ptr = data.as_ptr();
    let vector_end = len - len % LANES;
    let mut i = 0usize;
    let mut total_lines = 0u64;
    let mut total_words = 0u64;
    let mut prev_in_word = false;

    // SAFETY: every 16-byte load starts at `i < vector_end <= len - 15`, so it
    // stays inside `data`; SSE2 availability is the caller's obligation.
    unsafe {
        let nl_byte = _mm_set1_epi8(b'\n' as i8);
        let zero = _mm_setzero_si128();
        let ones = _mm_set1_epi8(1);
        // Signed compares: bytes >= 0x80 are negative and therefore fall
        // outside both ranges, i.e. they are classified as transparent.
        let below_print = _mm_set1_epi8(0x20i8);
        let above_print = _mm_set1_epi8(0x7Fi8);
        let space_char = _mm_set1_epi8(0x20i8);
        let below_ws = _mm_set1_epi8(0x08i8);
        let above_ws = _mm_set1_epi8(0x0Ei8);

        while i < vector_end {
            let batch_end = vector_end.min(i + MAX_VECTORS_PER_BATCH * LANES);
            let mut line_acc = zero;

            while i < batch_end {
                let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);

                let is_nl = _mm_cmpeq_epi8(v, nl_byte);
                line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));

                let is_printable = _mm_and_si128(
                    _mm_cmpgt_epi8(v, below_print),
                    _mm_cmpgt_epi8(above_print, v),
                );
                let is_space = _mm_or_si128(
                    _mm_cmpeq_epi8(v, space_char),
                    _mm_and_si128(_mm_cmpgt_epi8(v, below_ws), _mm_cmpgt_epi8(above_ws, v)),
                );

                let print_mask = _mm_movemask_epi8(is_printable) as u32;
                let space_mask = _mm_movemask_epi8(is_space) as u32;

                if (print_mask | space_mask) == 0xFFFF {
                    // No transparent bytes: a word starts wherever a printable
                    // byte follows a non-printable one.
                    let prev_mask = (print_mask << 1) | (prev_in_word as u32);
                    total_words += (print_mask & !prev_mask).count_ones() as u64;
                    prev_in_word = (print_mask >> 15) & 1 == 1;
                } else {
                    // Transparent bytes present: replay the state machine bit by bit.
                    let mut in_word = prev_in_word;
                    for bit in 0..16 {
                        if (space_mask >> bit) & 1 != 0 {
                            in_word = false;
                        } else if (print_mask >> bit) & 1 != 0 && !in_word {
                            total_words += 1;
                            in_word = true;
                        }
                    }
                    prev_in_word = in_word;
                }

                i += LANES;
            }

            // Horizontal sum of the per-lane newline counters.
            let sad = _mm_sad_epu8(line_acc, zero);
            let folded = _mm_add_epi64(sad, _mm_unpackhi_epi64(sad, sad));
            total_lines += _mm_cvtsi128_si64(folded) as u64;
        }
    }

    // Scalar tail (fewer than 16 bytes).
    for &b in &data[vector_end..] {
        match b {
            b'\n' => {
                total_lines += 1;
                prev_in_word = false;
            }
            0x21..=0x7E => {
                if !prev_in_word {
                    total_words += 1;
                    prev_in_word = true;
                }
            }
            0x09..=0x0D | b' ' => prev_in_word = false,
            _ => {}
        }
    }

    // Skip leading transparent bytes: only the first whitespace-or-printable
    // byte decides whether the chunk opens in the middle of a word.
    let first_is_word = data
        .iter()
        .copied()
        .find(|&b| matches!(b, 0x09..=0x0D | 0x20..=0x7E))
        .map_or(false, |b| b >= 0x21);

    (total_lines, total_words, first_is_word, prev_in_word)
}
439
440#[inline]
442fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
443 #[cfg(target_arch = "x86_64")]
444 {
445 if is_x86_feature_detected!("avx2") && data.len() >= 64 {
446 return unsafe { count_lw_c_chunk_avx2(data) };
447 }
448 if data.len() >= 32 {
449 return unsafe { count_lw_c_chunk_sse2(data) };
450 }
451 }
452 count_lw_c_chunk(data)
453}
454
455fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
461 let mut lines = 0u64;
462 let mut words = 0u64;
463 let mut in_word = false;
464 let mut first_is_printable = false;
465 let mut seen_first_non_transparent = false;
466 let mut i = 0;
467 let len = data.len();
468
469 while i < len {
470 let b = unsafe { *data.get_unchecked(i) };
471 if b >= 0x21 && b <= 0x7E {
472 if !seen_first_non_transparent {
474 seen_first_non_transparent = true;
475 first_is_printable = true;
476 }
477 if !in_word {
478 in_word = true;
479 words += 1;
480 }
481 i += 1;
482 while i < len {
484 let b = unsafe { *data.get_unchecked(i) };
485 if b >= 0x21 && b <= 0x7E {
486 i += 1;
487 } else {
488 break;
489 }
490 }
491 } else {
492 let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
493 if class == 1 {
494 if !seen_first_non_transparent {
496 seen_first_non_transparent = true;
497 }
499 if b == b'\n' {
500 lines += 1;
501 }
502 in_word = false;
503 }
504 i += 1;
506 }
507 }
508 (lines, words, first_is_printable, in_word)
509}
510
511fn count_words_utf8(data: &[u8]) -> u64 {
524 let mut words = 0u64;
525 let mut in_word = false;
526 let mut i = 0;
527 let len = data.len();
528
529 while i < len {
530 let b = unsafe { *data.get_unchecked(i) };
531
532 if b >= 0x21 && b <= 0x7E {
533 if !in_word {
535 in_word = true;
536 words += 1;
537 }
538 i += 1;
539 while i < len {
541 let b = unsafe { *data.get_unchecked(i) };
542 if b >= 0x21 && b <= 0x7E {
543 i += 1;
544 } else {
545 break;
546 }
547 }
548 } else if b < 0x80 {
549 let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
551 if class == 1 {
552 in_word = false;
553 }
554 i += 1;
556 } else if b < 0xC2 {
557 i += 1;
558 } else if b < 0xE0 {
559 if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
560 let cp = ((b as u32 & 0x1F) << 6)
561 | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
562 if is_unicode_space(cp) {
563 in_word = false;
564 } else if is_unicode_printable(cp) {
565 if !in_word {
566 in_word = true;
567 words += 1;
568 }
569 }
570 i += 2;
571 } else {
572 i += 1;
573 }
574 } else if b < 0xF0 {
575 if i + 2 < len
576 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
577 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
578 {
579 let cp = ((b as u32 & 0x0F) << 12)
580 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
581 | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
582 if is_unicode_space(cp) {
583 in_word = false;
584 } else if is_unicode_printable(cp) {
585 if !in_word {
586 in_word = true;
587 words += 1;
588 }
589 }
590 i += 3;
591 } else {
592 i += 1;
593 }
594 } else if b < 0xF5 {
595 if i + 3 < len
596 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
597 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
598 && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
599 {
600 let cp = ((b as u32 & 0x07) << 18)
601 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
602 | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
603 | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
604 if is_unicode_space(cp) {
605 in_word = false;
606 } else if is_unicode_printable(cp) {
607 if !in_word {
608 in_word = true;
609 words += 1;
610 }
611 }
612 i += 4;
613 } else {
614 i += 1;
615 }
616 } else {
617 i += 1;
618 }
619 }
620
621 words
622}
623
624pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
628 if utf8 {
629 count_lines_words_utf8_fused(data)
630 } else {
631 let (lines, words, _, _) = count_lw_c_chunk_fast(data);
632 (lines, words)
633 }
634}
635
636fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
643 let mut lines = 0u64;
644 let mut words = 0u64;
645 let mut in_word = false;
646 let mut i = 0;
647 let len = data.len();
648
649 while i < len {
650 let b = unsafe { *data.get_unchecked(i) };
651
652 if b >= 0x21 && b <= 0x7E {
653 if !in_word {
655 in_word = true;
656 words += 1;
657 }
658 i += 1;
659 while i < len {
661 let b = unsafe { *data.get_unchecked(i) };
662 if b >= 0x21 && b <= 0x7E {
663 i += 1;
664 } else {
665 break;
666 }
667 }
668 } else if b == b'\n' {
669 lines += 1;
670 in_word = false;
671 i += 1;
672 } else if b == b' ' {
673 in_word = false;
674 i += 1;
675 } else if b < 0x80 {
676 let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
678 if class == 1 {
679 in_word = false;
680 }
681 i += 1;
683 } else if b < 0xC2 {
684 i += 1;
685 } else if b < 0xE0 {
686 if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
687 let cp = ((b as u32 & 0x1F) << 6)
688 | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
689 if is_unicode_space(cp) {
690 in_word = false;
691 } else if is_unicode_printable(cp) {
692 if !in_word {
693 in_word = true;
694 words += 1;
695 }
696 }
697 i += 2;
698 } else {
699 i += 1;
700 }
701 } else if b < 0xF0 {
702 if i + 2 < len
703 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
704 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
705 {
706 let cp = ((b as u32 & 0x0F) << 12)
707 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
708 | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
709 if is_unicode_space(cp) {
710 in_word = false;
711 } else if is_unicode_printable(cp) {
712 if !in_word {
713 in_word = true;
714 words += 1;
715 }
716 }
717 i += 3;
718 } else {
719 i += 1;
720 }
721 } else if b < 0xF5 {
722 if i + 3 < len
723 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
724 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
725 && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
726 {
727 let cp = ((b as u32 & 0x07) << 18)
728 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
729 | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
730 | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
731 if is_unicode_space(cp) {
732 in_word = false;
733 } else if is_unicode_printable(cp) {
734 if !in_word {
735 in_word = true;
736 words += 1;
737 }
738 }
739 i += 4;
740 } else {
741 i += 1;
742 }
743 } else {
744 i += 1;
745 }
746 }
747
748 (lines, words)
749}
750
751pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
753 if utf8 {
754 let (lines, words) = count_lines_words_utf8_fused(data);
756 let chars = count_chars_utf8(data);
757 (lines, words, chars)
758 } else {
759 let (lines, words) = count_lines_words(data, false);
761 (lines, words, data.len() as u64)
762 }
763}
764
765pub fn count_chars_utf8(data: &[u8]) -> u64 {
772 #[cfg(target_arch = "x86_64")]
773 {
774 if is_x86_feature_detected!("avx2") {
775 return unsafe { count_chars_utf8_avx2(data) };
776 }
777 }
778 count_chars_utf8_scalar(data)
779}
780
/// AVX2 kernel that counts the bytes of `data` which are not UTF-8
/// continuation bytes (`byte & 0xC0 != 0x80`).
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    use std::arch::x86_64::*;

    const LANES: usize = 32;
    // Each 8-bit lane gains at most 1 per vector, so the accumulator is
    // drained after at most 255 vectors to keep it from wrapping.
    const MAX_VECTORS_PER_BATCH: usize = 255;

    let size = data.len();
    let base = data.as_ptr();
    let vector_end = size - size % LANES;
    let mut offset = 0usize;
    let mut chars = 0u64;

    // SAFETY: every 32-byte load starts at `offset < vector_end <= size - 31`,
    // so it stays inside `data`; AVX2 availability is the caller's obligation.
    unsafe {
        let top_two_bits = _mm256_set1_epi8(0xC0u8 as i8);
        let cont_pattern = _mm256_set1_epi8(0x80u8 as i8);
        let one_per_lane = _mm256_set1_epi8(1);
        let zero = _mm256_setzero_si256();

        while offset < vector_end {
            let batch_end = vector_end.min(offset + MAX_VECTORS_PER_BATCH * LANES);
            let mut lane_counts = zero;

            while offset < batch_end {
                let block = _mm256_loadu_si256(base.add(offset) as *const __m256i);
                let is_cont =
                    _mm256_cmpeq_epi8(_mm256_and_si256(block, top_two_bits), cont_pattern);
                // andnot yields 1 in every lane that is NOT a continuation byte.
                lane_counts =
                    _mm256_add_epi8(lane_counts, _mm256_andnot_si256(is_cont, one_per_lane));
                offset += LANES;
            }

            // Horizontal sum of the 32 byte counters.
            let sad = _mm256_sad_epu8(lane_counts, zero);
            let halves = _mm_add_epi64(
                _mm256_castsi256_si128(sad),
                _mm256_extracti128_si256(sad, 1),
            );
            let folded = _mm_add_epi64(halves, _mm_unpackhi_epi64(halves, halves));
            chars += _mm_cvtsi128_si64(folded) as u64;
        }
    }

    // Scalar tail (fewer than 32 bytes).
    for &byte in &data[vector_end..] {
        chars += ((byte & 0xC0) != 0x80) as u64;
    }

    chars
}
844
/// Portable UTF-8 character counter: counts the bytes of `data` that are not
/// continuation bytes (`byte & 0xC0 != 0x80`).
///
/// Works on 64-byte blocks so that all-ASCII blocks can be credited with 64
/// characters after a single OR-reduction.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    /// Number of non-continuation bytes in `bytes`.
    fn leading_bytes(bytes: &[u8]) -> u64 {
        bytes.iter().filter(|&&b| (b & 0xC0) != 0x80).count() as u64
    }

    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    let mut chars = 0u64;
    for block in blocks {
        // If no byte has its high bit set, the whole block is ASCII.
        let combined = block.iter().fold(0u8, |acc, &b| acc | b);
        chars += if combined < 0x80 {
            64
        } else {
            leading_bytes(block)
        };
    }

    chars + leading_bytes(tail)
}
896
/// Counts characters under the C locale, where every byte is one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let one_char_per_byte: usize = data.len();
    one_char_per_byte as u64
}
902
903#[inline]
905pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
906 if utf8 {
907 count_chars_utf8(data)
908 } else {
909 count_chars_c(data)
910 }
911}
912
/// Reports whether the process environment selects a UTF-8 locale.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order. The first one
/// that is set to a non-empty (Unicode) value decides the answer: `true` if
/// it contains `utf-8` or `utf8`, compared case-insensitively. Returns
/// `false` when none of them is set.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
}
925
/// Decodes the UTF-8 sequence at the start of `bytes`.
///
/// Returns `(code_point, bytes_consumed)`. A lead byte in 0xC2..=0xDF,
/// 0xE0..=0xEF or 0xF0..=0xF4 followed by the required number of continuation
/// bytes yields the decoded value and a length of 2, 3 or 4. In every other
/// case (ASCII, stray continuation byte, invalid lead, truncated or malformed
/// sequence) the first byte's own value is returned with a length of 1.
///
/// Only the continuation-byte pattern is validated; overlong encodings and
/// surrogate values are not rejected.
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let lead = bytes[0];
    let width: usize = match lead {
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        _ => return (lead as u32, 1),
    };

    match bytes.get(1..width) {
        Some(tail) if tail.iter().all(|&c| c & 0xC0 == 0x80) => {
            // Payload bits of the lead byte: 5, 4 or 3 of them.
            let seed = (lead & (0x7Fu8 >> width)) as u32;
            let cp = tail
                .iter()
                .fold(seed, |acc, &c| (acc << 6) | (c & 0x3F) as u32);
            (cp, width)
        }
        _ => (lead as u32, 1),
    }
}
969
/// Returns `true` when `cp` is in this module's table of code points that
/// occupy no display column (combining marks, format controls, variation
/// selectors, tag characters and similar).
#[inline]
fn is_zero_width(cp: u32) -> bool {
    use std::cmp::Ordering;

    // Inclusive `(first, last)` ranges, sorted ascending and non-overlapping
    // so that they can be binary-searched.
    const ZERO_WIDTH: &[(u32, u32)] = &[
        (0x0300, 0x036F),
        (0x0483, 0x0489),
        (0x0591, 0x05BD),
        (0x05BF, 0x05BF),
        (0x05C1, 0x05C2),
        (0x05C4, 0x05C5),
        (0x05C7, 0x05C7),
        (0x0600, 0x0605),
        (0x0610, 0x061A),
        (0x064B, 0x065F),
        (0x0670, 0x0670),
        (0x06D6, 0x06DD),
        (0x06DF, 0x06E4),
        (0x06E7, 0x06E8),
        (0x06EA, 0x06ED),
        (0x070F, 0x070F),
        (0x0711, 0x0711),
        (0x0730, 0x074A),
        (0x07A6, 0x07B0),
        (0x07EB, 0x07F3),
        (0x07FD, 0x07FD),
        (0x0816, 0x0819),
        (0x081B, 0x0823),
        (0x0825, 0x0827),
        (0x0829, 0x082D),
        (0x0859, 0x085B),
        (0x08D3, 0x08E1),
        (0x08E3, 0x0902),
        (0x093A, 0x093A),
        (0x093C, 0x093C),
        (0x0941, 0x0948),
        (0x094D, 0x094D),
        (0x0951, 0x0957),
        (0x0962, 0x0963),
        (0x0981, 0x0981),
        (0x09BC, 0x09BC),
        (0x09C1, 0x09C4),
        (0x09CD, 0x09CD),
        (0x09E2, 0x09E3),
        (0x09FE, 0x09FE),
        (0x0A01, 0x0A02),
        (0x0A3C, 0x0A3C),
        (0x0A41, 0x0A42),
        (0x0A47, 0x0A48),
        (0x0A4B, 0x0A4D),
        (0x0A51, 0x0A51),
        (0x0A70, 0x0A71),
        (0x0A75, 0x0A75),
        (0x0A81, 0x0A82),
        (0x0ABC, 0x0ABC),
        (0x0AC1, 0x0AC5),
        (0x0AC7, 0x0AC8),
        (0x0ACD, 0x0ACD),
        (0x0AE2, 0x0AE3),
        (0x0AFA, 0x0AFF),
        (0x0B01, 0x0B01),
        (0x0B3C, 0x0B3C),
        (0x0B3F, 0x0B3F),
        (0x0B41, 0x0B44),
        (0x0B4D, 0x0B4D),
        (0x0B56, 0x0B56),
        (0x0B62, 0x0B63),
        (0x0B82, 0x0B82),
        (0x0BC0, 0x0BC0),
        (0x0BCD, 0x0BCD),
        (0x0C00, 0x0C00),
        (0x0C04, 0x0C04),
        (0x0C3E, 0x0C40),
        (0x0C46, 0x0C48),
        (0x0C4A, 0x0C4D),
        (0x0C55, 0x0C56),
        (0x0C62, 0x0C63),
        (0x0C81, 0x0C81),
        (0x0CBC, 0x0CBC),
        (0x0CBF, 0x0CBF),
        (0x0CC6, 0x0CC6),
        (0x0CCC, 0x0CCD),
        (0x0CE2, 0x0CE3),
        (0x0D00, 0x0D01),
        (0x0D3B, 0x0D3C),
        (0x0D41, 0x0D44),
        (0x0D4D, 0x0D4D),
        (0x0D62, 0x0D63),
        (0x0DCA, 0x0DCA),
        (0x0DD2, 0x0DD4),
        (0x0DD6, 0x0DD6),
        (0x0E31, 0x0E31),
        (0x0E34, 0x0E3A),
        (0x0E47, 0x0E4E),
        (0x0EB1, 0x0EB1),
        (0x0EB4, 0x0EBC),
        (0x0EC8, 0x0ECD),
        (0x0F18, 0x0F19),
        (0x0F35, 0x0F35),
        (0x0F37, 0x0F37),
        (0x0F39, 0x0F39),
        (0x0F71, 0x0F7E),
        (0x0F80, 0x0F84),
        (0x0F86, 0x0F87),
        (0x0F8D, 0x0F97),
        (0x0F99, 0x0FBC),
        (0x0FC6, 0x0FC6),
        (0x102D, 0x1030),
        (0x1032, 0x1037),
        (0x1039, 0x103A),
        (0x103D, 0x103E),
        (0x1058, 0x1059),
        (0x105E, 0x1060),
        (0x1071, 0x1074),
        (0x1082, 0x1082),
        (0x1085, 0x1086),
        (0x108D, 0x108D),
        (0x109D, 0x109D),
        (0x1160, 0x11FF),
        (0x135D, 0x135F),
        (0x1712, 0x1714),
        (0x1732, 0x1734),
        (0x1752, 0x1753),
        (0x1772, 0x1773),
        (0x17B4, 0x17B5),
        (0x17B7, 0x17BD),
        (0x17C6, 0x17C6),
        (0x17C9, 0x17D3),
        (0x17DD, 0x17DD),
        (0x180B, 0x180D),
        (0x1885, 0x1886),
        (0x18A9, 0x18A9),
        (0x1920, 0x1922),
        (0x1927, 0x1928),
        (0x1932, 0x1932),
        (0x1939, 0x193B),
        (0x1A17, 0x1A18),
        (0x1A1B, 0x1A1B),
        (0x1A56, 0x1A56),
        (0x1A58, 0x1A5E),
        (0x1A60, 0x1A60),
        (0x1A62, 0x1A62),
        (0x1A65, 0x1A6C),
        (0x1A73, 0x1A7C),
        (0x1A7F, 0x1A7F),
        (0x1AB0, 0x1ABE),
        (0x1B00, 0x1B03),
        (0x1B34, 0x1B34),
        (0x1B36, 0x1B3A),
        (0x1B3C, 0x1B3C),
        (0x1B42, 0x1B42),
        (0x1B6B, 0x1B73),
        (0x1B80, 0x1B81),
        (0x1BA2, 0x1BA5),
        (0x1BA8, 0x1BA9),
        (0x1BAB, 0x1BAD),
        (0x1BE6, 0x1BE6),
        (0x1BE8, 0x1BE9),
        (0x1BED, 0x1BED),
        (0x1BEF, 0x1BF1),
        (0x1C2C, 0x1C33),
        (0x1C36, 0x1C37),
        (0x1CD0, 0x1CD2),
        (0x1CD4, 0x1CE0),
        (0x1CE2, 0x1CE8),
        (0x1CED, 0x1CED),
        (0x1CF4, 0x1CF4),
        (0x1CF8, 0x1CF9),
        (0x1DC0, 0x1DF9),
        (0x1DFB, 0x1DFF),
        (0x200B, 0x200F),
        (0x202A, 0x202E),
        (0x2060, 0x2064),
        (0x2066, 0x206F),
        (0x20D0, 0x20F0),
        (0xFE00, 0xFE0F),
        (0xFE20, 0xFE2F),
        (0xFEFF, 0xFEFF),
        (0xFFF9, 0xFFFB),
        (0x1D167, 0x1D169),
        (0x1D173, 0x1D182),
        (0x1D185, 0x1D18B),
        (0x1D1AA, 0x1D1AD),
        (0x1D242, 0x1D244),
        (0xE0001, 0xE0001),
        (0xE0020, 0xE007F),
        (0xE0100, 0xE01EF),
    ];

    ZERO_WIDTH
        .binary_search_by(|&(first, last)| {
            if last < cp {
                Ordering::Less
            } else if first > cp {
                Ordering::Greater
            } else {
                Ordering::Equal
            }
        })
        .is_ok()
}
1160
/// Returns `true` when `cp` is in this module's table of code points that
/// occupy two display columns (CJK, Hangul, fullwidth forms, many emoji).
#[inline]
fn is_wide_char(cp: u32) -> bool {
    use std::cmp::Ordering;

    // Inclusive `(first, last)` ranges, sorted ascending and non-overlapping
    // so that they can be binary-searched.
    const WIDE: &[(u32, u32)] = &[
        (0x1100, 0x115F),
        (0x231A, 0x231B),
        (0x2329, 0x232A),
        (0x23E9, 0x23F3),
        (0x23F8, 0x23FA),
        (0x25FD, 0x25FE),
        (0x2614, 0x2615),
        (0x2648, 0x2653),
        (0x267F, 0x267F),
        (0x2693, 0x2693),
        (0x26A1, 0x26A1),
        (0x26AA, 0x26AB),
        (0x26BD, 0x26BE),
        (0x26C4, 0x26C5),
        (0x26CE, 0x26CE),
        (0x26D4, 0x26D4),
        (0x26EA, 0x26EA),
        (0x26F2, 0x26F3),
        (0x26F5, 0x26F5),
        (0x26FA, 0x26FA),
        (0x26FD, 0x26FD),
        (0x2702, 0x2702),
        (0x2705, 0x2705),
        (0x2708, 0x270D),
        (0x270F, 0x270F),
        (0x2712, 0x2712),
        (0x2714, 0x2714),
        (0x2716, 0x2716),
        (0x271D, 0x271D),
        (0x2721, 0x2721),
        (0x2728, 0x2728),
        (0x2733, 0x2734),
        (0x2744, 0x2744),
        (0x2747, 0x2747),
        (0x274C, 0x274C),
        (0x274E, 0x274E),
        (0x2753, 0x2755),
        (0x2757, 0x2757),
        (0x2763, 0x2764),
        (0x2795, 0x2797),
        (0x27A1, 0x27A1),
        (0x27B0, 0x27B0),
        (0x27BF, 0x27BF),
        (0x2934, 0x2935),
        (0x2B05, 0x2B07),
        (0x2B1B, 0x2B1C),
        (0x2B50, 0x2B50),
        (0x2B55, 0x2B55),
        (0x2E80, 0x303E),
        (0x3040, 0x33BF),
        (0x3400, 0x4DBF),
        (0x4E00, 0xA4CF),
        (0xA960, 0xA97C),
        (0xAC00, 0xD7A3),
        (0xF900, 0xFAFF),
        (0xFE10, 0xFE19),
        (0xFE30, 0xFE6F),
        (0xFF01, 0xFF60),
        (0xFFE0, 0xFFE6),
        (0x1F004, 0x1F004),
        (0x1F0CF, 0x1F0CF),
        (0x1F170, 0x1F171),
        (0x1F17E, 0x1F17F),
        (0x1F18E, 0x1F18E),
        (0x1F191, 0x1F19A),
        (0x1F1E0, 0x1F1FF),
        (0x1F200, 0x1F202),
        (0x1F210, 0x1F23B),
        (0x1F240, 0x1F248),
        (0x1F250, 0x1F251),
        (0x1F260, 0x1F265),
        (0x1F300, 0x1F64F),
        (0x1F680, 0x1F6FF),
        (0x1F900, 0x1F9FF),
        (0x1FA00, 0x1FA6F),
        (0x1FA70, 0x1FAFF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ];

    WIDE.binary_search_by(|&(first, last)| {
        if last < cp {
            Ordering::Less
        } else if first > cp {
            Ordering::Greater
        } else {
            Ordering::Equal
        }
    })
    .is_ok()
}
1247
/// Computes the display width of the widest line in `data` under the
/// C-locale rules.
///
/// * Space and printable ASCII (0x20..=0x7E) advance the column by one.
/// * Tab advances the column to the next multiple of 8.
/// * Carriage return moves the column back to 0 without ending the line.
/// * Newline and form feed (0x0C) end the current line.
/// * Every other byte has no width.
///
/// A final line without a terminator is included in the result.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut widest = 0u64; // widest finished line so far
    let mut line_width = 0u64; // widest column reached on the current line
    let mut column = 0u64; // current cursor column

    for &byte in data {
        match byte {
            0x20..=0x7E => {
                column += 1;
                line_width = line_width.max(column);
            }
            b'\t' => {
                column = (column + 8) & !7;
                line_width = line_width.max(column);
            }
            b'\r' => column = 0,
            b'\n' | 0x0C => {
                widest = widest.max(line_width);
                column = 0;
                line_width = 0;
            }
            _ => {}
        }
    }

    widest.max(line_width)
}
1329
1330pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1337 let mut max_len: u64 = 0;
1338 let mut line_len: u64 = 0;
1339 let mut linepos: u64 = 0;
1340 let mut i = 0;
1341 let len = data.len();
1342
1343 while i < len {
1344 let b = unsafe { *data.get_unchecked(i) };
1345
1346 if b >= 0x21 && b <= 0x7E {
1347 i += 1;
1349 let mut run = 1u64;
1350 while i < len {
1351 let b = unsafe { *data.get_unchecked(i) };
1352 if b >= 0x21 && b <= 0x7E {
1353 run += 1;
1354 i += 1;
1355 } else {
1356 break;
1357 }
1358 }
1359 linepos += run;
1360 if linepos > line_len {
1361 line_len = linepos;
1362 }
1363 } else if b < 0x80 {
1364 match b {
1366 b' ' => {
1367 linepos += 1;
1368 if linepos > line_len {
1369 line_len = linepos;
1370 }
1371 }
1372 b'\n' => {
1373 if line_len > max_len {
1374 max_len = line_len;
1375 }
1376 linepos = 0;
1377 line_len = 0;
1378 }
1379 b'\t' => {
1380 linepos = (linepos + 8) & !7;
1381 if linepos > line_len {
1382 line_len = linepos;
1383 }
1384 }
1385 b'\r' => {
1386 linepos = 0;
1387 }
1388 0x0C => {
1389 if line_len > max_len {
1390 max_len = line_len;
1391 }
1392 linepos = 0;
1393 line_len = 0;
1394 }
1395 _ => {} }
1397 i += 1;
1398 } else {
1399 let (cp, len) = decode_utf8(&data[i..]);
1401
1402 if cp <= 0x9F {
1404 } else if is_zero_width(cp) {
1406 } else if is_wide_char(cp) {
1408 linepos += 2;
1409 if linepos > line_len {
1410 line_len = linepos;
1411 }
1412 } else {
1413 linepos += 1;
1415 if linepos > line_len {
1416 line_len = linepos;
1417 }
1418 }
1419 i += len;
1420 }
1421 }
1422
1423 if line_len > max_len {
1425 max_len = line_len;
1426 }
1427
1428 max_len
1429}
1430
1431#[inline]
1433pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1434 if utf8 {
1435 max_line_length_utf8(data)
1436 } else {
1437 max_line_length_c(data)
1438 }
1439}
1440
1441pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1453 if utf8 {
1454 let (lines, words) = count_lines_words_utf8_fused(data);
1455 WcCounts {
1456 lines,
1457 words,
1458 bytes: data.len() as u64,
1459 chars: count_chars_utf8(data),
1460 max_line_length: max_line_length_utf8(data),
1461 }
1462 } else {
1463 WcCounts {
1464 lines: count_lines(data),
1465 words: count_words_locale(data, false),
1466 bytes: data.len() as u64,
1467 chars: data.len() as u64,
1468 max_line_length: max_line_length_c(data),
1469 }
1470 }
1471}
1472
/// Heuristically checks whether `data` is ASCII by sampling up to three
/// windows of at most 256 bytes: the start, the middle (only when the input
/// is longer than two windows) and the end (only when it is longer than one).
///
/// Returns `true` for empty input and whenever every sampled byte is below
/// 0x80. Bytes outside the sampled windows are not inspected, so `true` does
/// not guarantee that the whole buffer is ASCII.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    let total = data.len();
    if total == 0 {
        return true;
    }

    let window = total.min(256);

    // Leading window.
    if !data[..window].is_ascii() {
        return false;
    }

    // Middle window, centred on the midpoint of the buffer.
    if total > window * 2 {
        let from = (total / 2).saturating_sub(window / 2);
        let to = (from + window).min(total);
        if !data[from..to].is_ascii() {
            return false;
        }
    }

    // Trailing window.
    if total > window && !data[total - window..].is_ascii() {
        return false;
    }

    true
}
1531
1532fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
1541 if data.is_empty() || num_chunks <= 1 {
1542 return vec![data];
1543 }
1544 let chunk_size = data.len() / num_chunks;
1545 let mut chunks = Vec::with_capacity(num_chunks);
1546 let mut pos = 0;
1547
1548 for _ in 0..num_chunks - 1 {
1549 let target = pos + chunk_size;
1550 if target >= data.len() {
1551 break;
1552 }
1553 let boundary = memchr::memchr(b'\n', &data[target..])
1554 .map(|p| target + p + 1)
1555 .unwrap_or(data.len());
1556 if boundary > pos {
1557 chunks.push(&data[pos..boundary]);
1558 }
1559 pos = boundary;
1560 }
1561 if pos < data.len() {
1562 chunks.push(&data[pos..]);
1563 }
1564 chunks
1565}
1566
1567pub fn count_lines_parallel(data: &[u8]) -> u64 {
1570 if data.len() < PARALLEL_THRESHOLD {
1571 return count_lines(data);
1572 }
1573
1574 let num_threads = rayon::current_num_threads().max(1);
1575 let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1577
1578 data.par_chunks(chunk_size)
1579 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1580 .sum()
1581}
1582
1583pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1585 if data.len() < PARALLEL_THRESHOLD {
1586 return count_words_locale(data, utf8);
1587 }
1588
1589 let num_threads = rayon::current_num_threads().max(1);
1590
1591 if utf8 {
1592 let chunks = split_at_newlines(data, num_threads);
1595 chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
1596 } else {
1597 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1599
1600 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1601
1602 let results: Vec<(u64, u64, bool, bool)> = chunks
1604 .par_iter()
1605 .map(|chunk| count_lw_c_chunk(chunk))
1606 .collect();
1607
1608 let mut total = 0u64;
1609 for i in 0..results.len() {
1610 total += results[i].1;
1611 if i > 0 && results[i - 1].3 && results[i].2 {
1615 total -= 1;
1616 }
1617 }
1618 total
1619 }
1620}
1621
1622pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1624 if !utf8 {
1625 return data.len() as u64;
1626 }
1627 if data.len() < PARALLEL_THRESHOLD {
1628 return count_chars_utf8(data);
1629 }
1630
1631 let num_threads = rayon::current_num_threads().max(1);
1632 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1633
1634 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1635}
1636
1637pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1640 let (lines, words) = count_lines_words(data, utf8);
1641 (lines, words, data.len() as u64)
1642}
1643
1644pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1650 if data.len() < PARALLEL_THRESHOLD {
1651 return count_lwb(data, utf8);
1653 }
1654
1655 let num_threads = rayon::current_num_threads().max(1);
1656
1657 let (lines, words) = if !utf8 {
1658 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1660
1661 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1662 let results: Vec<(u64, u64, bool, bool)> = chunks
1663 .par_iter()
1664 .map(|chunk| count_lw_c_chunk_fast(chunk))
1665 .collect();
1666
1667 let mut line_total = 0u64;
1668 let mut word_total = 0u64;
1669 for i in 0..results.len() {
1670 line_total += results[i].0;
1671 word_total += results[i].1;
1672 if i > 0 && results[i - 1].3 && results[i].2 {
1673 word_total -= 1;
1674 }
1675 }
1676
1677 (line_total, word_total)
1678 } else {
1679 let is_ascii = check_ascii_sample(data);
1681 if is_ascii {
1682 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1684 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1685 let results: Vec<(u64, u64, bool, bool)> = chunks
1686 .par_iter()
1687 .map(|chunk| count_lw_c_chunk_fast(chunk))
1688 .collect();
1689
1690 let mut line_total = 0u64;
1691 let mut word_total = 0u64;
1692 for i in 0..results.len() {
1693 line_total += results[i].0;
1694 word_total += results[i].1;
1695 if i > 0 && results[i - 1].3 && results[i].2 {
1696 word_total -= 1;
1697 }
1698 }
1699 (line_total, word_total)
1700 } else {
1701 let chunks = split_at_newlines(data, num_threads);
1704 let results: Vec<(u64, u64)> = chunks
1705 .par_iter()
1706 .map(|chunk| count_lines_words_utf8_fused(chunk))
1707 .collect();
1708 let mut line_total = 0u64;
1709 let mut word_total = 0u64;
1710 for (l, w) in results {
1711 line_total += l;
1712 word_total += w;
1713 }
1714 (line_total, word_total)
1715 }
1716 };
1717
1718 (lines, words, data.len() as u64)
1719}
1720
1721pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1725 if data.len() < PARALLEL_THRESHOLD {
1726 let lines = count_lines(data);
1727 let words = count_words_locale(data, utf8);
1728 let chars = count_chars(data, utf8);
1729 return (lines, words, chars);
1730 }
1731
1732 let num_threads = rayon::current_num_threads().max(1);
1733
1734 if utf8 {
1735 let chunks = split_at_newlines(data, num_threads);
1737 let results: Vec<(u64, u64, u64)> = chunks
1738 .par_iter()
1739 .map(|chunk| {
1740 let (lines, words) = count_lines_words_utf8_fused(chunk);
1741 let chars = count_chars_utf8(chunk);
1742 (lines, words, chars)
1743 })
1744 .collect();
1745 let mut lines = 0u64;
1746 let mut words = 0u64;
1747 let mut chars = 0u64;
1748 for (l, w, c) in results {
1749 lines += l;
1750 words += w;
1751 chars += c;
1752 }
1753 (lines, words, chars)
1754 } else {
1755 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1757 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1758 let results: Vec<(u64, u64, bool, bool)> = chunks
1759 .par_iter()
1760 .map(|chunk| count_lw_c_chunk_fast(chunk))
1761 .collect();
1762 let mut lines = 0u64;
1763 let mut words = 0u64;
1764 for i in 0..results.len() {
1765 lines += results[i].0;
1766 words += results[i].1;
1767 if i > 0 && results[i - 1].3 && results[i].2 {
1768 words -= 1;
1769 }
1770 }
1771 (lines, words, data.len() as u64)
1772 }
1773}
1774
1775pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
1779 if data.len() < PARALLEL_THRESHOLD {
1780 return max_line_length(data, utf8);
1781 }
1782 let num_threads = rayon::current_num_threads().max(1);
1783 let chunks = split_at_newlines(data, num_threads);
1784 chunks
1785 .par_iter()
1786 .map(|chunk| {
1787 if utf8 {
1788 max_line_length_utf8(chunk)
1789 } else {
1790 max_line_length_c(chunk)
1791 }
1792 })
1793 .max()
1794 .unwrap_or(0)
1795}
1796
1797pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
1801 if data.len() < PARALLEL_THRESHOLD {
1802 return count_all(data, utf8);
1803 }
1804
1805 let num_threads = rayon::current_num_threads().max(1);
1806 let chunks = split_at_newlines(data, num_threads);
1807
1808 if utf8 {
1809 let results: Vec<(u64, u64, u64, u64)> = chunks
1810 .par_iter()
1811 .map(|chunk| {
1812 let (lines, words) = count_lines_words_utf8_fused(chunk);
1813 let chars = count_chars_utf8(chunk);
1814 let max_ll = max_line_length_utf8(chunk);
1815 (lines, words, chars, max_ll)
1816 })
1817 .collect();
1818
1819 let mut counts = WcCounts {
1820 bytes: data.len() as u64,
1821 ..Default::default()
1822 };
1823 for (l, w, c, m) in results {
1824 counts.lines += l;
1825 counts.words += w;
1826 counts.chars += c;
1827 if m > counts.max_line_length {
1828 counts.max_line_length = m;
1829 }
1830 }
1831 counts
1832 } else {
1833 let results: Vec<(u64, u64, u64)> = chunks
1835 .par_iter()
1836 .map(|chunk| {
1837 let (lines, words) = count_lines_words(chunk, false);
1838 let max_ll = max_line_length_c(chunk);
1839 (lines, words, max_ll)
1840 })
1841 .collect();
1842
1843 let mut counts = WcCounts {
1844 bytes: data.len() as u64,
1845 chars: data.len() as u64,
1846 ..Default::default()
1847 };
1848 for (l, w, m) in &results {
1849 counts.lines += l;
1850 counts.words += w;
1851 if *m > counts.max_line_length {
1852 counts.max_line_length = *m;
1853 }
1854 }
1855 counts
1856 }
1857}