1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Input size in bytes (2 MiB) below which the `*_parallel` counters in this
/// file skip rayon and fall back to their sequential counterparts.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
8
/// The full set of statistics for one input buffer, as filled in by `count_all`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of `\n` bytes in the input.
    pub lines: u64,
    /// Number of words, using the locale-dependent word rules of this module.
    pub words: u64,
    /// Length of the input in bytes.
    pub bytes: u64,
    /// Number of characters: equal to `bytes` in the C locale; in UTF-8 mode
    /// the number of bytes that are not UTF-8 continuation bytes.
    pub chars: u64,
    /// Largest display width reached by any line of the input.
    pub max_line_length: u64,
}
18
/// Builds the byte classification table used for word counting in the C locale.
///
/// Each entry holds one of three classes:
/// * `0` – word content (NUL and the graphic ASCII bytes `0x21..=0x7E`),
/// * `1` – whitespace that ends a word (`0x09..=0x0D` and space),
/// * `2` – everything else; such bytes leave the "inside a word" state untouched.
const fn make_byte_class_c() -> [u8; 256] {
    const WORD: u8 = 0;
    const SPACE: u8 = 1;
    const OTHER: u8 = 2;

    let mut table = [OTHER; 256];

    // TAB, LF, VT, FF, CR are contiguous; the space character is separate.
    let mut ws = 0x09usize;
    while ws <= 0x0D {
        table[ws] = SPACE;
        ws += 1;
    }
    table[b' ' as usize] = SPACE;

    // NUL is deliberately classified as word content in this table.
    table[0x00] = WORD;
    let mut graphic = 0x21usize;
    while graphic <= 0x7E {
        table[graphic] = WORD;
        graphic += 1;
    }

    table
}
56
/// Compile-time byte classification table for the C locale, produced by
/// `make_byte_class_c` (0 = word content, 1 = whitespace, 2 = neither).
const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
58
/// Builds the byte classification table used for the ASCII range in UTF-8 mode.
///
/// Each entry holds one of three classes:
/// * `0` – word content (the graphic ASCII bytes `0x21..=0x7E`),
/// * `1` – whitespace that ends a word (`0x09..=0x0D` and space),
/// * `2` – everything else, including NUL and all bytes `>= 0x80`.
const fn make_byte_class_utf8() -> [u8; 256] {
    const WORD: u8 = 0;
    const SPACE: u8 = 1;
    const OTHER: u8 = 2;

    let mut table = [OTHER; 256];

    // TAB, LF, VT, FF, CR are contiguous; the space character is separate.
    let mut ws = 0x09usize;
    while ws <= 0x0D {
        table[ws] = SPACE;
        ws += 1;
    }
    table[b' ' as usize] = SPACE;

    let mut graphic = 0x21usize;
    while graphic <= 0x7E {
        table[graphic] = WORD;
        graphic += 1;
    }

    table
}
78
/// Compile-time byte classification table for UTF-8 mode, produced by
/// `make_byte_class_utf8` (0 = word content, 1 = whitespace, 2 = neither).
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
80
/// Returns `true` when `cp` is one of the non-ASCII code points this module
/// treats as a word separator.
///
/// ASCII whitespace is not covered here; it is handled by the byte tables.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        // U+2000..=U+200A form one contiguous block.
        0x2000..=0x200A => true,
        0x00A0 | 0x1680 | 0x2028 | 0x2029 | 0x202F | 0x205F | 0x3000 => true,
        _ => false,
    }
}
102
/// Returns `true` when `cp` lies above the C0/C1 control range, i.e. it is
/// U+00A0 or higher. Callers use this to decide whether a decoded code point
/// counts as word content.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    const LAST_C1_CONTROL: u32 = 0x9F;
    cp > LAST_C1_CONTROL
}
110
111#[inline]
118pub fn count_lines(data: &[u8]) -> u64 {
119 memchr_iter(b'\n', data).count() as u64
120}
121
/// Returns the length of `data` in bytes.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let size = data.len();
    size as u64
}
127
128pub fn count_words(data: &[u8]) -> u64 {
130 count_words_locale(data, true)
131}
132
133pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
140 if utf8 {
141 count_words_utf8(data)
142 } else {
143 count_words_c(data)
144 }
145}
146
147fn count_words_c(data: &[u8]) -> u64 {
153 let mut words = 0u64;
154 let mut in_word = false;
155 let mut i = 0;
156 let len = data.len();
157
158 while i < len {
159 let b = unsafe { *data.get_unchecked(i) };
160 if b >= 0x21 && b <= 0x7E {
161 if !in_word {
163 in_word = true;
164 words += 1;
165 }
166 i += 1;
167 while i < len {
169 let b = unsafe { *data.get_unchecked(i) };
170 if b >= 0x21 && b <= 0x7E {
171 i += 1;
172 } else {
173 break;
174 }
175 }
176 } else {
177 let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
178 if class == 1 {
179 in_word = false;
180 } else if class == 0 {
181 if !in_word {
183 in_word = true;
184 words += 1;
185 }
186 }
187 i += 1;
189 }
190 }
191 words
192}
193
194fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
198 let mut lines = 0u64;
199 let mut words = 0u64;
200 let mut in_word = false;
201 let mut first_active_is_printable = false;
202 let mut seen_active = false;
203 let mut i = 0;
204 let len = data.len();
205
206 while i < len {
207 let b = unsafe { *data.get_unchecked(i) };
208 if b >= 0x21 && b <= 0x7E {
209 if !seen_active {
211 seen_active = true;
212 first_active_is_printable = true;
213 }
214 if !in_word {
215 in_word = true;
216 words += 1;
217 }
218 i += 1;
219 while i < len {
221 let b = unsafe { *data.get_unchecked(i) };
222 if b >= 0x21 && b <= 0x7E {
223 i += 1;
224 } else {
225 break;
226 }
227 }
228 } else if b == b'\n' {
229 lines += 1;
230 if !seen_active {
231 seen_active = true;
232 }
233 in_word = false;
234 i += 1;
235 } else {
236 let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
237 if class == 1 {
238 if !seen_active {
239 seen_active = true;
240 }
241 in_word = false;
242 } else if class == 0 {
243 if !seen_active {
245 seen_active = true;
246 first_active_is_printable = true;
247 }
248 if !in_word {
249 in_word = true;
250 words += 1;
251 }
252 }
253 i += 1;
254 }
255 }
256 (lines, words, first_active_is_printable, in_word)
257}
258
259fn count_words_utf8(data: &[u8]) -> u64 {
272 let mut words = 0u64;
273 let mut in_word = false;
274 let mut i = 0;
275 let len = data.len();
276
277 while i < len {
278 let b = unsafe { *data.get_unchecked(i) };
279
280 if b >= 0x21 && b <= 0x7E {
281 if !in_word {
283 in_word = true;
284 words += 1;
285 }
286 i += 1;
287 while i < len {
289 let b = unsafe { *data.get_unchecked(i) };
290 if b >= 0x21 && b <= 0x7E {
291 i += 1;
292 } else {
293 break;
294 }
295 }
296 } else if b < 0x80 {
297 let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
299 if class == 1 {
300 in_word = false;
301 }
302 i += 1;
304 } else if b < 0xC2 {
305 i += 1;
306 } else if b < 0xE0 {
307 if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
308 let cp = ((b as u32 & 0x1F) << 6)
309 | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
310 if is_unicode_space(cp) {
311 in_word = false;
312 } else if is_unicode_printable(cp) {
313 if !in_word {
314 in_word = true;
315 words += 1;
316 }
317 }
318 i += 2;
319 } else {
320 i += 1;
321 }
322 } else if b < 0xF0 {
323 if i + 2 < len
324 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
325 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
326 {
327 let cp = ((b as u32 & 0x0F) << 12)
328 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
329 | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
330 if is_unicode_space(cp) {
331 in_word = false;
332 } else if is_unicode_printable(cp) {
333 if !in_word {
334 in_word = true;
335 words += 1;
336 }
337 }
338 i += 3;
339 } else {
340 i += 1;
341 }
342 } else if b < 0xF5 {
343 if i + 3 < len
344 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
345 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
346 && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
347 {
348 let cp = ((b as u32 & 0x07) << 18)
349 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
350 | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
351 | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
352 if is_unicode_space(cp) {
353 in_word = false;
354 } else if is_unicode_printable(cp) {
355 if !in_word {
356 in_word = true;
357 words += 1;
358 }
359 }
360 i += 4;
361 } else {
362 i += 1;
363 }
364 } else {
365 i += 1;
366 }
367 }
368
369 words
370}
371
372pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
376 if utf8 {
377 count_lines_words_utf8_fused(data)
378 } else {
379 let mut lines = 0u64;
380 let mut words = 0u64;
381 let mut in_word = false;
382 let mut i = 0;
383 let len = data.len();
384
385 while i < len {
386 let b = unsafe { *data.get_unchecked(i) };
387 if b >= 0x21 && b <= 0x7E {
388 if !in_word {
390 in_word = true;
391 words += 1;
392 }
393 i += 1;
394 while i < len {
395 let b = unsafe { *data.get_unchecked(i) };
396 if b >= 0x21 && b <= 0x7E {
397 i += 1;
398 } else {
399 break;
400 }
401 }
402 } else if b == b'\n' {
403 lines += 1;
404 in_word = false;
405 i += 1;
406 } else {
407 let class = unsafe { *BYTE_CLASS_C.get_unchecked(b as usize) };
408 if class == 1 {
409 in_word = false;
410 }
411 i += 1;
412 }
413 }
414 (lines, words)
415 }
416}
417
418fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
425 let mut lines = 0u64;
426 let mut words = 0u64;
427 let mut in_word = false;
428 let mut i = 0;
429 let len = data.len();
430
431 while i < len {
432 let b = unsafe { *data.get_unchecked(i) };
433
434 if b >= 0x21 && b <= 0x7E {
435 if !in_word {
437 in_word = true;
438 words += 1;
439 }
440 i += 1;
441 while i < len {
443 let b = unsafe { *data.get_unchecked(i) };
444 if b >= 0x21 && b <= 0x7E {
445 i += 1;
446 } else {
447 break;
448 }
449 }
450 } else if b == b'\n' {
451 lines += 1;
452 in_word = false;
453 i += 1;
454 } else if b == b' ' {
455 in_word = false;
456 i += 1;
457 } else if b < 0x80 {
458 let class = unsafe { *BYTE_CLASS_UTF8.get_unchecked(b as usize) };
460 if class == 1 {
461 in_word = false;
462 }
463 i += 1;
465 } else if b < 0xC2 {
466 i += 1;
467 } else if b < 0xE0 {
468 if i + 1 < len && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80 {
469 let cp = ((b as u32 & 0x1F) << 6)
470 | (unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F);
471 if is_unicode_space(cp) {
472 in_word = false;
473 } else if is_unicode_printable(cp) {
474 if !in_word {
475 in_word = true;
476 words += 1;
477 }
478 }
479 i += 2;
480 } else {
481 i += 1;
482 }
483 } else if b < 0xF0 {
484 if i + 2 < len
485 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
486 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
487 {
488 let cp = ((b as u32 & 0x0F) << 12)
489 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 6)
490 | (unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F);
491 if is_unicode_space(cp) {
492 in_word = false;
493 } else if is_unicode_printable(cp) {
494 if !in_word {
495 in_word = true;
496 words += 1;
497 }
498 }
499 i += 3;
500 } else {
501 i += 1;
502 }
503 } else if b < 0xF5 {
504 if i + 3 < len
505 && (unsafe { *data.get_unchecked(i + 1) } & 0xC0) == 0x80
506 && (unsafe { *data.get_unchecked(i + 2) } & 0xC0) == 0x80
507 && (unsafe { *data.get_unchecked(i + 3) } & 0xC0) == 0x80
508 {
509 let cp = ((b as u32 & 0x07) << 18)
510 | ((unsafe { *data.get_unchecked(i + 1) } as u32 & 0x3F) << 12)
511 | ((unsafe { *data.get_unchecked(i + 2) } as u32 & 0x3F) << 6)
512 | (unsafe { *data.get_unchecked(i + 3) } as u32 & 0x3F);
513 if is_unicode_space(cp) {
514 in_word = false;
515 } else if is_unicode_printable(cp) {
516 if !in_word {
517 in_word = true;
518 words += 1;
519 }
520 }
521 i += 4;
522 } else {
523 i += 1;
524 }
525 } else {
526 i += 1;
527 }
528 }
529
530 (lines, words)
531}
532
533pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
535 if utf8 {
536 let (lines, words) = count_lines_words_utf8_fused(data);
538 let chars = count_chars_utf8(data);
539 (lines, words, chars)
540 } else {
541 let (lines, words) = count_lines_words(data, false);
543 (lines, words, data.len() as u64)
544 }
545}
546
547pub fn count_chars_utf8(data: &[u8]) -> u64 {
554 #[cfg(target_arch = "x86_64")]
555 {
556 if is_x86_feature_detected!("avx2") {
557 return unsafe { count_chars_utf8_avx2(data) };
558 }
559 }
560 count_chars_utf8_scalar(data)
561}
562
/// AVX2 implementation of UTF-8 character counting: counts the bytes of
/// `data` that are not continuation bytes (`(b & 0xC0) != 0x80`).
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
    unsafe {
        use std::arch::x86_64::*;

        let mask_c0 = _mm256_set1_epi8(0xC0u8 as i8);
        let val_80 = _mm256_set1_epi8(0x80u8 as i8);
        let ones = _mm256_set1_epi8(1);
        let zero = _mm256_setzero_si256();

        let mut total = 0u64;
        let len = data.len();
        let ptr = data.as_ptr();
        let mut i = 0;
        // Per-lane 8-bit counters of non-continuation bytes.
        let mut acc = _mm256_setzero_si256();
        // Number of 32-byte blocks added to `acc` since the last flush.
        let mut batch = 0u32;

        while i + 32 <= len {
            let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
            // Lanes equal to 0x80 after masking are continuation bytes.
            let masked = _mm256_and_si256(v, mask_c0);
            let is_cont = _mm256_cmpeq_epi8(masked, val_80);
            // 1 in every lane that is NOT a continuation byte, 0 otherwise.
            let non_cont = _mm256_andnot_si256(is_cont, ones);
            acc = _mm256_add_epi8(acc, non_cont);

            batch += 1;
            // Each block adds at most 1 per lane, so flush after 255 blocks,
            // before an 8-bit lane could wrap.
            if batch >= 255 {
                // Horizontal sum of all 32 lanes into one 64-bit value.
                let sad = _mm256_sad_epu8(acc, zero);
                let hi = _mm256_extracti128_si256(sad, 1);
                let lo = _mm256_castsi256_si128(sad);
                let sum = _mm_add_epi64(lo, hi);
                let hi64 = _mm_unpackhi_epi64(sum, sum);
                let t = _mm_add_epi64(sum, hi64);
                total += _mm_cvtsi128_si64(t) as u64;
                acc = _mm256_setzero_si256();
                batch = 0;
            }
            i += 32;
        }

        // Flush whatever is left in the lane counters.
        if batch > 0 {
            let sad = _mm256_sad_epu8(acc, zero);
            let hi = _mm256_extracti128_si256(sad, 1);
            let lo = _mm256_castsi256_si128(sad);
            let sum = _mm_add_epi64(lo, hi);
            let hi64 = _mm_unpackhi_epi64(sum, sum);
            let t = _mm_add_epi64(sum, hi64);
            total += _mm_cvtsi128_si64(t) as u64;
        }

        // Scalar tail for the final `len % 32` bytes.
        while i < len {
            total += ((*ptr.add(i) & 0xC0) != 0x80) as u64;
            i += 1;
        }

        total
    }
}
626
/// Portable implementation of UTF-8 character counting: counts the bytes of
/// `data` that are not continuation bytes (`(b & 0xC0) != 0x80`).
///
/// The input is processed in 64-byte blocks; a block made up entirely of
/// ASCII contributes 64 without inspecting its bytes individually.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    const BLOCK: usize = 64;

    fn starts_char(byte: u8) -> bool {
        (byte & 0xC0) != 0x80
    }

    let blocks = data.chunks_exact(BLOCK);
    let leftover = blocks.remainder();

    let from_blocks: u64 = blocks
        .map(|block| {
            if block.is_ascii() {
                BLOCK as u64
            } else {
                block.iter().filter(|&&b| starts_char(b)).count() as u64
            }
        })
        .sum();
    let from_leftover = leftover.iter().filter(|&&b| starts_char(b)).count() as u64;

    from_blocks + from_leftover
}
678
/// Counts characters in the C locale, where every byte is one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_total = data.len();
    byte_total as u64
}
684
685#[inline]
687pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
688 if utf8 {
689 count_chars_utf8(data)
690 } else {
691 count_chars_c(data)
692 }
693}
694
/// Reports whether the process environment selects a UTF-8 locale.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order. The first one
/// that is set to a non-empty (and valid Unicode) value decides: the result is
/// `true` when that value contains `utf-8` or `utf8`, ignoring ASCII case.
/// Returns `false` when none of the variables qualifies.
pub fn is_utf8_locale() -> bool {
    const PRECEDENCE: [&str; 3] = ["LC_ALL", "LC_CTYPE", "LANG"];

    let effective = PRECEDENCE
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty());

    match effective {
        Some(value) => {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        }
        None => false,
    }
}
707
/// Decodes the UTF-8 sequence at the start of `bytes`.
///
/// Returns `(code_point, bytes_consumed)`. When the first byte does not start
/// a sequence this function recognises, or the sequence is truncated or has a
/// bad continuation byte, the first byte itself is returned as the code point
/// with a length of 1. Only the structure of the sequence is checked (lead
/// byte range and `10xxxxxx` continuation bytes).
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let lead = bytes[0];
    let single = (u32::from(lead), 1);

    // Number of continuation bytes and the payload mask of the lead byte.
    let (tail_len, lead_mask) = match lead {
        0xC2..=0xDF => (1usize, 0x1Fu32),
        0xE0..=0xEF => (2, 0x0F),
        0xF0..=0xF4 => (3, 0x07),
        // ASCII, stray continuation bytes, 0xC0/0xC1 and 0xF5..=0xFF.
        _ => return single,
    };

    match bytes.get(1..=tail_len) {
        Some(tail) if tail.iter().all(|&b| (b & 0xC0) == 0x80) => {
            let cp = tail
                .iter()
                .fold(u32::from(lead) & lead_mask, |acc, &b| {
                    (acc << 6) | u32::from(b & 0x3F)
                });
            (cp, tail_len + 1)
        }
        _ => single,
    }
}
751
/// Returns `true` when `cp` is in the hard-coded set of code points that
/// `max_line_length_utf8` counts as occupying zero display columns.
///
/// NOTE(review): the ranges look like combining marks, format controls and
/// variation selectors taken from a Unicode width table; the source table and
/// Unicode version are not recorded here, so confirm before extending.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    matches!(
        cp,
        0x0300..=0x036F | 0x0483..=0x0489 | 0x0591..=0x05BD | 0x05BF
            | 0x05C1..=0x05C2
            | 0x05C4..=0x05C5
            | 0x05C7
            | 0x0600..=0x0605 | 0x0610..=0x061A | 0x064B..=0x065F | 0x0670
            | 0x06D6..=0x06DD
            | 0x06DF..=0x06E4
            | 0x06E7..=0x06E8
            | 0x06EA..=0x06ED
            | 0x070F
            | 0x0711
            | 0x0730..=0x074A
            | 0x07A6..=0x07B0
            | 0x07EB..=0x07F3
            | 0x07FD
            | 0x0816..=0x0819
            | 0x081B..=0x0823
            | 0x0825..=0x0827
            | 0x0829..=0x082D
            | 0x0859..=0x085B
            | 0x08D3..=0x08E1
            | 0x08E3..=0x0902
            | 0x093A
            | 0x093C
            | 0x0941..=0x0948
            | 0x094D
            | 0x0951..=0x0957
            | 0x0962..=0x0963
            | 0x0981
            | 0x09BC
            | 0x09C1..=0x09C4
            | 0x09CD
            | 0x09E2..=0x09E3
            | 0x09FE
            | 0x0A01..=0x0A02
            | 0x0A3C
            | 0x0A41..=0x0A42
            | 0x0A47..=0x0A48
            | 0x0A4B..=0x0A4D
            | 0x0A51
            | 0x0A70..=0x0A71
            | 0x0A75
            | 0x0A81..=0x0A82
            | 0x0ABC
            | 0x0AC1..=0x0AC5
            | 0x0AC7..=0x0AC8
            | 0x0ACD
            | 0x0AE2..=0x0AE3
            | 0x0AFA..=0x0AFF
            | 0x0B01
            | 0x0B3C
            | 0x0B3F
            | 0x0B41..=0x0B44
            | 0x0B4D
            | 0x0B56
            | 0x0B62..=0x0B63
            | 0x0B82
            | 0x0BC0
            | 0x0BCD
            | 0x0C00
            | 0x0C04
            | 0x0C3E..=0x0C40
            | 0x0C46..=0x0C48
            | 0x0C4A..=0x0C4D
            | 0x0C55..=0x0C56
            | 0x0C62..=0x0C63
            | 0x0C81
            | 0x0CBC
            | 0x0CBF
            | 0x0CC6
            | 0x0CCC..=0x0CCD
            | 0x0CE2..=0x0CE3
            | 0x0D00..=0x0D01
            | 0x0D3B..=0x0D3C
            | 0x0D41..=0x0D44
            | 0x0D4D
            | 0x0D62..=0x0D63
            | 0x0DCA
            | 0x0DD2..=0x0DD4
            | 0x0DD6
            | 0x0E31
            | 0x0E34..=0x0E3A
            | 0x0E47..=0x0E4E
            | 0x0EB1
            | 0x0EB4..=0x0EBC
            | 0x0EC8..=0x0ECD
            | 0x0F18..=0x0F19
            | 0x0F35
            | 0x0F37
            | 0x0F39
            | 0x0F71..=0x0F7E
            | 0x0F80..=0x0F84
            | 0x0F86..=0x0F87
            | 0x0F8D..=0x0F97
            | 0x0F99..=0x0FBC
            | 0x0FC6
            | 0x102D..=0x1030
            | 0x1032..=0x1037
            | 0x1039..=0x103A
            | 0x103D..=0x103E
            | 0x1058..=0x1059
            | 0x105E..=0x1060
            | 0x1071..=0x1074
            | 0x1082
            | 0x1085..=0x1086
            | 0x108D
            | 0x109D
            | 0x1160..=0x11FF | 0x135D..=0x135F
            | 0x1712..=0x1714
            | 0x1732..=0x1734
            | 0x1752..=0x1753
            | 0x1772..=0x1773
            | 0x17B4..=0x17B5
            | 0x17B7..=0x17BD
            | 0x17C6
            | 0x17C9..=0x17D3
            | 0x17DD
            | 0x180B..=0x180D
            | 0x1885..=0x1886
            | 0x18A9
            | 0x1920..=0x1922
            | 0x1927..=0x1928
            | 0x1932
            | 0x1939..=0x193B
            | 0x1A17..=0x1A18
            | 0x1A1B
            | 0x1A56
            | 0x1A58..=0x1A5E
            | 0x1A60
            | 0x1A62
            | 0x1A65..=0x1A6C
            | 0x1A73..=0x1A7C
            | 0x1A7F
            | 0x1AB0..=0x1ABE
            | 0x1B00..=0x1B03
            | 0x1B34
            | 0x1B36..=0x1B3A
            | 0x1B3C
            | 0x1B42
            | 0x1B6B..=0x1B73
            | 0x1B80..=0x1B81
            | 0x1BA2..=0x1BA5
            | 0x1BA8..=0x1BA9
            | 0x1BAB..=0x1BAD
            | 0x1BE6
            | 0x1BE8..=0x1BE9
            | 0x1BED
            | 0x1BEF..=0x1BF1
            | 0x1C2C..=0x1C33
            | 0x1C36..=0x1C37
            | 0x1CD0..=0x1CD2
            | 0x1CD4..=0x1CE0
            | 0x1CE2..=0x1CE8
            | 0x1CED
            | 0x1CF4
            | 0x1CF8..=0x1CF9
            | 0x1DC0..=0x1DF9
            | 0x1DFB..=0x1DFF
            | 0x200B..=0x200F | 0x202A..=0x202E | 0x2060..=0x2064 | 0x2066..=0x206F | 0x20D0..=0x20F0 | 0xFE00..=0xFE0F | 0xFE20..=0xFE2F | 0xFEFF | 0xFFF9..=0xFFFB | 0x1D167..=0x1D169
            | 0x1D173..=0x1D182
            | 0x1D185..=0x1D18B
            | 0x1D1AA..=0x1D1AD
            | 0x1D242..=0x1D244
            | 0xE0001
            | 0xE0020..=0xE007F
            | 0xE0100..=0xE01EF )
}
942
/// Returns `true` when `cp` is in the hard-coded set of code points that
/// `max_line_length_utf8` counts as occupying two display columns.
///
/// NOTE(review): the ranges look like East Asian wide/fullwidth characters
/// plus a selection of emoji and symbols; the source table and Unicode version
/// are not recorded here, so confirm before extending.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    matches!(
        cp,
        0x1100..=0x115F | 0x231A..=0x231B | 0x2329..=0x232A | 0x23E9..=0x23F3 | 0x23F8..=0x23FA
            | 0x25FD..=0x25FE
            | 0x2614..=0x2615
            | 0x2648..=0x2653
            | 0x267F
            | 0x2693
            | 0x26A1
            | 0x26AA..=0x26AB
            | 0x26BD..=0x26BE
            | 0x26C4..=0x26C5
            | 0x26CE
            | 0x26D4
            | 0x26EA
            | 0x26F2..=0x26F3
            | 0x26F5
            | 0x26FA
            | 0x26FD
            | 0x2702
            | 0x2705
            | 0x2708..=0x270D
            | 0x270F
            | 0x2712
            | 0x2714
            | 0x2716
            | 0x271D
            | 0x2721
            | 0x2728
            | 0x2733..=0x2734
            | 0x2744
            | 0x2747
            | 0x274C
            | 0x274E
            | 0x2753..=0x2755
            | 0x2757
            | 0x2763..=0x2764
            | 0x2795..=0x2797
            | 0x27A1
            | 0x27B0
            | 0x27BF
            | 0x2934..=0x2935
            | 0x2B05..=0x2B07
            | 0x2B1B..=0x2B1C
            | 0x2B50
            | 0x2B55
            | 0x2E80..=0x303E | 0x3040..=0x33BF | 0x3400..=0x4DBF | 0x4E00..=0xA4CF | 0xA960..=0xA97C | 0xAC00..=0xD7A3 | 0xF900..=0xFAFF | 0xFE10..=0xFE19 | 0xFE30..=0xFE6F | 0xFF01..=0xFF60 | 0xFFE0..=0xFFE6 | 0x1F004
            | 0x1F0CF
            | 0x1F170..=0x1F171
            | 0x1F17E..=0x1F17F
            | 0x1F18E
            | 0x1F191..=0x1F19A
            | 0x1F1E0..=0x1F1FF | 0x1F200..=0x1F202
            | 0x1F210..=0x1F23B
            | 0x1F240..=0x1F248
            | 0x1F250..=0x1F251
            | 0x1F260..=0x1F265
            | 0x1F300..=0x1F64F | 0x1F680..=0x1F6FF | 0x1F900..=0x1F9FF | 0x1FA00..=0x1FA6F
            | 0x1FA70..=0x1FAFF
            | 0x20000..=0x2FFFD | 0x30000..=0x3FFFD )
}
1029
/// Computes the widest line of `data` in display columns, C-locale rules.
///
/// * graphic ASCII (`0x21..=0x7E`) and space advance the column by one,
/// * tab advances to the next multiple of eight,
/// * carriage return moves the column back to zero without ending the line,
/// * newline and form feed end the line,
/// * every other byte has no width.
///
/// A final line without a terminator is included in the result.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut best = 0u64; // widest finished line so far
    let mut widest = 0u64; // furthest column reached on the current line
    let mut column = 0u64; // current column on the current line

    for &byte in data {
        match byte {
            0x21..=0x7E | b' ' => column += 1,
            b'\t' => column = (column + 8) & !7,
            b'\r' => column = 0,
            b'\n' | 0x0C => {
                best = best.max(widest);
                widest = 0;
                column = 0;
            }
            _ => {}
        }
        // `widest` never drops below `column`, so this is a no-op unless the
        // column just moved past the previous high-water mark.
        widest = widest.max(column);
    }

    best.max(widest)
}
1111
1112pub fn max_line_length_utf8(data: &[u8]) -> u64 {
1119 let mut max_len: u64 = 0;
1120 let mut line_len: u64 = 0;
1121 let mut linepos: u64 = 0;
1122 let mut i = 0;
1123 let len = data.len();
1124
1125 while i < len {
1126 let b = unsafe { *data.get_unchecked(i) };
1127
1128 if b >= 0x21 && b <= 0x7E {
1129 i += 1;
1131 let mut run = 1u64;
1132 while i < len {
1133 let b = unsafe { *data.get_unchecked(i) };
1134 if b >= 0x21 && b <= 0x7E {
1135 run += 1;
1136 i += 1;
1137 } else {
1138 break;
1139 }
1140 }
1141 linepos += run;
1142 if linepos > line_len {
1143 line_len = linepos;
1144 }
1145 } else if b < 0x80 {
1146 match b {
1148 b' ' => {
1149 linepos += 1;
1150 if linepos > line_len {
1151 line_len = linepos;
1152 }
1153 }
1154 b'\n' => {
1155 if line_len > max_len {
1156 max_len = line_len;
1157 }
1158 linepos = 0;
1159 line_len = 0;
1160 }
1161 b'\t' => {
1162 linepos = (linepos + 8) & !7;
1163 if linepos > line_len {
1164 line_len = linepos;
1165 }
1166 }
1167 b'\r' => {
1168 linepos = 0;
1169 }
1170 0x0C => {
1171 if line_len > max_len {
1172 max_len = line_len;
1173 }
1174 linepos = 0;
1175 line_len = 0;
1176 }
1177 _ => {} }
1179 i += 1;
1180 } else {
1181 let (cp, len) = decode_utf8(&data[i..]);
1183
1184 if cp <= 0x9F {
1186 } else if is_zero_width(cp) {
1188 } else if is_wide_char(cp) {
1190 linepos += 2;
1191 if linepos > line_len {
1192 line_len = linepos;
1193 }
1194 } else {
1195 linepos += 1;
1197 if linepos > line_len {
1198 line_len = linepos;
1199 }
1200 }
1201 i += len;
1202 }
1203 }
1204
1205 if line_len > max_len {
1207 max_len = line_len;
1208 }
1209
1210 max_len
1211}
1212
1213#[inline]
1215pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1216 if utf8 {
1217 max_line_length_utf8(data)
1218 } else {
1219 max_line_length_c(data)
1220 }
1221}
1222
1223pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1235 if utf8 {
1236 let (lines, words) = count_lines_words_utf8_fused(data);
1237 WcCounts {
1238 lines,
1239 words,
1240 bytes: data.len() as u64,
1241 chars: count_chars_utf8(data),
1242 max_line_length: max_line_length_utf8(data),
1243 }
1244 } else {
1245 WcCounts {
1246 lines: count_lines(data),
1247 words: count_words_locale(data, false),
1248 bytes: data.len() as u64,
1249 chars: data.len() as u64,
1250 max_line_length: max_line_length_c(data),
1251 }
1252 }
1253}
1254
/// Cheap heuristic: reports whether three sampled windows of `data` contain
/// only ASCII bytes.
///
/// The windows are up to 256 bytes each: the start of the buffer, a window
/// around the midpoint (only when the buffer is longer than two windows), and
/// the end of the buffer. Bytes outside the windows are never inspected, so a
/// `true` result does not prove the whole buffer is ASCII. Empty input yields
/// `true`.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    const WINDOW: usize = 256;

    let total = data.len();
    let window = WINDOW.min(total);

    if !data[..window].is_ascii() {
        return false;
    }

    if total > 2 * window {
        let start = (total / 2).saturating_sub(window / 2);
        let end = (start + window).min(total);
        if !data[start..end].is_ascii() {
            return false;
        }
    }

    // When the buffer fits in a single window the head check covered it all.
    total <= window || data[total - window..].is_ascii()
}
1313
1314pub fn count_lines_parallel(data: &[u8]) -> u64 {
1321 if data.len() < PARALLEL_THRESHOLD {
1322 return count_lines(data);
1323 }
1324
1325 let num_threads = rayon::current_num_threads().max(1);
1326 let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
1328
1329 data.par_chunks(chunk_size)
1330 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1331 .sum()
1332}
1333
1334pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1336 if utf8 || data.len() < PARALLEL_THRESHOLD {
1337 return count_words_locale(data, utf8);
1340 }
1341
1342 let num_threads = rayon::current_num_threads().max(1);
1344 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1345
1346 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1347
1348 let results: Vec<(u64, u64, bool, bool)> = chunks
1350 .par_iter()
1351 .map(|chunk| count_lw_c_chunk(chunk))
1352 .collect();
1353
1354 let mut total = 0u64;
1355 for i in 0..results.len() {
1356 total += results[i].1;
1357 if i > 0 && results[i - 1].3 && results[i].2 {
1361 total -= 1;
1362 }
1363 }
1364 total
1365}
1366
1367pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1369 if !utf8 {
1370 return data.len() as u64;
1371 }
1372 if data.len() < PARALLEL_THRESHOLD {
1373 return count_chars_utf8(data);
1374 }
1375
1376 let num_threads = rayon::current_num_threads().max(1);
1377 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1378
1379 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1380}
1381
1382pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1385 let (lines, words) = count_lines_words(data, utf8);
1386 (lines, words, data.len() as u64)
1387}
1388
1389pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1394 if data.len() < PARALLEL_THRESHOLD {
1395 return count_lwb(data, utf8);
1397 }
1398
1399 let effective_utf8 = if utf8 {
1403 let is_ascii = check_ascii_sample(data);
1405 if is_ascii {
1406 false } else {
1408 true }
1410 } else {
1411 false
1412 };
1413
1414 let (lines, words) = if effective_utf8 {
1415 count_lines_words_utf8_fused(data)
1417 } else {
1418 let num_threads = rayon::current_num_threads().max(1);
1420 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1421
1422 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1423 let results: Vec<(u64, u64, bool, bool)> = chunks
1424 .par_iter()
1425 .map(|chunk| count_lw_c_chunk(chunk))
1426 .collect();
1427
1428 let mut line_total = 0u64;
1429 let mut word_total = 0u64;
1430 for i in 0..results.len() {
1431 line_total += results[i].0;
1432 word_total += results[i].1;
1433 if i > 0 && results[i - 1].3 && results[i].2 {
1434 word_total -= 1;
1435 }
1436 }
1437
1438 (line_total, word_total)
1439 };
1440
1441 (lines, words, data.len() as u64)
1442}
1443
1444pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1446 if data.len() < PARALLEL_THRESHOLD {
1447 let lines = count_lines(data);
1448 let words = count_words_locale(data, utf8);
1449 let chars = count_chars(data, utf8);
1450 return (lines, words, chars);
1451 }
1452
1453 let words = count_words_parallel(data, utf8);
1455
1456 let num_threads = rayon::current_num_threads().max(1);
1458 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1459
1460 let lines: u64 = data
1461 .par_chunks(chunk_size)
1462 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1463 .sum();
1464
1465 let chars = if utf8 {
1466 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1467 } else {
1468 data.len() as u64
1469 };
1470
1471 (lines, words, chars)
1472}