1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Input size in bytes (2 MiB) below which the `*_parallel` counters in this
/// file fall back to their sequential implementations.
const PARALLEL_THRESHOLD: usize = 2 << 20;
7
/// The five counters produced for a single input buffer.
///
/// All fields default to zero.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct WcCounts {
    /// Number of `\n` bytes.
    pub lines: u64,
    /// Number of words.
    pub words: u64,
    /// Length of the input in bytes.
    pub bytes: u64,
    /// Number of characters (equal to `bytes` when the input is not treated as UTF-8).
    pub chars: u64,
    /// Display width of the widest line.
    pub max_line_length: u64,
}
17
/// Builds the per-byte classification table used by the C-locale word counters.
///
/// Classes:
/// * `0` — word constituent: the printable, non-space ASCII bytes `0x21..=0x7E`;
/// * `1` — whitespace: TAB, LF, VT, FF, CR and space;
/// * `2` — everything else; the word counters leave their state untouched on
///   these bytes.
const fn make_byte_class_c() -> [u8; 256] {
    let mut table = [2u8; 256];
    let mut byte = 0usize;
    while byte < 256 {
        table[byte] = match byte {
            0x09..=0x0D | 0x20 => 1,
            0x21..=0x7E => 0,
            _ => 2,
        };
        byte += 1;
    }
    table
}

/// Byte classes for the C locale, computed at compile time.
const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
53
/// Builds the classification table the UTF-8 word counters consult for bytes
/// below `0x80`.
///
/// Classes: `1` for the six ASCII whitespace bytes (TAB, LF, VT, FF, CR,
/// space), `0` for the printable non-space bytes `0x21..=0x7E`, and `2` for
/// every other byte value.
const fn make_byte_class_utf8() -> [u8; 256] {
    const ASCII_SPACES: [u8; 6] = [b'\t', b'\n', 0x0B, 0x0C, b'\r', b' '];

    let mut classes = [2u8; 256];

    let mut idx = 0usize;
    while idx < 6 {
        classes[ASCII_SPACES[idx] as usize] = 1;
        idx += 1;
    }

    let mut graphic = 0x21usize;
    while graphic < 0x7F {
        classes[graphic] = 0;
        graphic += 1;
    }

    classes
}

/// Byte classes for the UTF-8 path, computed at compile time.
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
75
/// Builds a lookup table holding `1` for the printable ASCII bytes
/// `0x20..=0x7E` (space included) and `0` for every other byte value.
const fn make_printable_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut byte = 0usize;
    while byte < 256 {
        if matches!(byte, 0x20..=0x7E) {
            table[byte] = 1;
        }
        byte += 1;
    }
    table
}

/// Printable-ASCII flags, computed at compile time.
const PRINTABLE_TABLE: [u8; 256] = make_printable_table();
88
/// Returns `true` for the non-ASCII code points this file treats as word
/// separators: U+00A0, U+1680, U+2000–U+200A, U+2028, U+2029, U+202F, U+205F
/// and U+3000.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        0x2000..=0x200A => true,
        0x00A0 | 0x1680 | 0x2028 | 0x2029 | 0x202F | 0x205F | 0x3000 => true,
        _ => false,
    }
}
110
/// Returns `true` when `cp` lies above the C1 control range, i.e. is at least
/// U+00A0. No finer-grained printability check is performed.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    const LAST_C1_CONTROL: u32 = 0x9F;
    cp > LAST_C1_CONTROL
}
118
119#[inline]
126pub fn count_lines(data: &[u8]) -> u64 {
127 memchr_iter(b'\n', data).count() as u64
128}
129
/// Returns the length of `data` in bytes.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let len = data.len();
    len as u64
}
135
136pub fn count_words(data: &[u8]) -> u64 {
138 count_words_locale(data, true)
139}
140
141pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
148 if utf8 {
149 count_words_utf8(data)
150 } else {
151 count_words_c(data)
152 }
153}
154
155fn count_words_c(data: &[u8]) -> u64 {
159 let mut words = 0u64;
160 let mut in_word = false;
161 for &b in data {
162 let class = BYTE_CLASS_C[b as usize];
163 if class == 1 {
164 in_word = false;
166 } else if class == 0 {
167 if !in_word {
169 in_word = true;
170 words += 1;
171 }
172 }
173 }
175 words
176}
177
178fn count_words_c_chunk(data: &[u8]) -> (u64, bool, bool) {
182 let mut words = 0u64;
183 let mut in_word = false;
184 let mut first_active_is_printable = false;
185 let mut seen_active = false;
186
187 for &b in data {
188 let class = BYTE_CLASS_C[b as usize];
189 if class == 1 {
190 if !seen_active {
191 seen_active = true;
192 }
194 in_word = false;
195 } else if class == 0 {
196 if !seen_active {
197 seen_active = true;
198 first_active_is_printable = true;
199 }
200 if !in_word {
201 in_word = true;
202 words += 1;
203 }
204 }
205 }
206 (words, first_active_is_printable, in_word)
207}
208
209fn count_words_utf8(data: &[u8]) -> u64 {
218 let mut words = 0u64;
219 let mut in_word = false;
220 let mut i = 0;
221
222 while i < data.len() {
223 let b = data[i];
224
225 if b < 0x80 {
226 let class = BYTE_CLASS_UTF8[b as usize];
228 if class == 1 {
229 in_word = false;
230 } else if class == 0 {
231 if !in_word {
232 in_word = true;
233 words += 1;
234 }
235 }
236 i += 1;
238 } else if b < 0xC2 {
239 i += 1;
243 } else if b < 0xE0 {
244 if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
246 let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
247 if is_unicode_space(cp) {
248 in_word = false;
249 } else if is_unicode_printable(cp) {
250 if !in_word {
251 in_word = true;
252 words += 1;
253 }
254 }
255 i += 2;
257 } else {
258 i += 1;
260 }
261 } else if b < 0xF0 {
262 if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
264 let cp = ((b as u32 & 0x0F) << 12)
265 | ((data[i + 1] as u32 & 0x3F) << 6)
266 | (data[i + 2] as u32 & 0x3F);
267 if is_unicode_space(cp) {
268 in_word = false;
269 } else if is_unicode_printable(cp) {
270 if !in_word {
271 in_word = true;
272 words += 1;
273 }
274 }
275 i += 3;
276 } else {
277 i += 1;
279 }
280 } else if b < 0xF5 {
281 if i + 3 < data.len()
283 && (data[i + 1] & 0xC0) == 0x80
284 && (data[i + 2] & 0xC0) == 0x80
285 && (data[i + 3] & 0xC0) == 0x80
286 {
287 let cp = ((b as u32 & 0x07) << 18)
288 | ((data[i + 1] as u32 & 0x3F) << 12)
289 | ((data[i + 2] as u32 & 0x3F) << 6)
290 | (data[i + 3] as u32 & 0x3F);
291 if is_unicode_space(cp) {
292 in_word = false;
293 } else if is_unicode_printable(cp) {
294 if !in_word {
295 in_word = true;
296 words += 1;
297 }
298 }
299 i += 4;
300 } else {
301 i += 1;
303 }
304 } else {
305 i += 1;
307 }
308 }
309
310 words
311}
312
313pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
317 if utf8 {
318 count_lines_words_utf8_fused(data)
319 } else {
320 let mut lines = 0u64;
321 let mut words = 0u64;
322 let mut in_word = false;
323 for &b in data {
324 if b == b'\n' {
325 lines += 1;
326 }
327 let class = BYTE_CLASS_C[b as usize];
328 if class == 1 {
329 in_word = false;
330 } else if class == 0 {
331 if !in_word {
332 in_word = true;
333 words += 1;
334 }
335 }
336 }
337 (lines, words)
338 }
339}
340
341fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
344 let mut lines = 0u64;
345 let mut words = 0u64;
346 let mut in_word = false;
347 let mut i = 0;
348
349 while i < data.len() {
350 let b = data[i];
351
352 if b < 0x80 {
353 if b == b'\n' {
355 lines += 1;
356 in_word = false;
357 } else {
358 let class = BYTE_CLASS_UTF8[b as usize];
359 if class == 1 {
360 in_word = false;
361 } else if class == 0 {
362 if !in_word {
363 in_word = true;
364 words += 1;
365 }
366 }
367 }
368 i += 1;
369 } else if b < 0xC2 {
370 i += 1;
371 } else if b < 0xE0 {
372 if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
373 let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
374 if is_unicode_space(cp) {
375 in_word = false;
376 } else if is_unicode_printable(cp) {
377 if !in_word {
378 in_word = true;
379 words += 1;
380 }
381 }
382 i += 2;
383 } else {
384 i += 1;
385 }
386 } else if b < 0xF0 {
387 if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
388 let cp = ((b as u32 & 0x0F) << 12)
389 | ((data[i + 1] as u32 & 0x3F) << 6)
390 | (data[i + 2] as u32 & 0x3F);
391 if is_unicode_space(cp) {
392 in_word = false;
393 } else if is_unicode_printable(cp) {
394 if !in_word {
395 in_word = true;
396 words += 1;
397 }
398 }
399 i += 3;
400 } else {
401 i += 1;
402 }
403 } else if b < 0xF5 {
404 if i + 3 < data.len()
405 && (data[i + 1] & 0xC0) == 0x80
406 && (data[i + 2] & 0xC0) == 0x80
407 && (data[i + 3] & 0xC0) == 0x80
408 {
409 let cp = ((b as u32 & 0x07) << 18)
410 | ((data[i + 1] as u32 & 0x3F) << 12)
411 | ((data[i + 2] as u32 & 0x3F) << 6)
412 | (data[i + 3] as u32 & 0x3F);
413 if is_unicode_space(cp) {
414 in_word = false;
415 } else if is_unicode_printable(cp) {
416 if !in_word {
417 in_word = true;
418 words += 1;
419 }
420 }
421 i += 4;
422 } else {
423 i += 1;
424 }
425 } else {
426 i += 1;
427 }
428 }
429
430 (lines, words)
431}
432
433pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
435 if utf8 {
436 let (lines, words) = count_lines_words_utf8_fused(data);
438 let chars = count_chars_utf8(data);
439 (lines, words, chars)
440 } else {
441 let mut lines = 0u64;
443 let mut words = 0u64;
444 let mut in_word = false;
445 for &b in data {
446 if b == b'\n' {
447 lines += 1;
448 }
449 let class = BYTE_CLASS_C[b as usize];
450 if class == 1 {
451 in_word = false;
452 } else if class == 0 {
453 if !in_word {
454 in_word = true;
455 words += 1;
456 }
457 }
458 }
459 (lines, words, data.len() as u64)
460 }
461}
462
/// Counts UTF-8 characters by counting every byte that is *not* a continuation
/// byte (`10xxxxxx`).
///
/// No validation is performed: any byte outside `0x80..=0xBF` counts as one
/// character. The input is walked in fixed 64-byte blocks followed by the
/// remaining tail.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    /// 1 for a byte that starts a character, 0 for a continuation byte.
    fn starts_char(byte: u8) -> u64 {
        u64::from(byte & 0xC0 != 0x80)
    }

    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    let in_blocks: u64 = blocks
        .map(|block| block.iter().map(|&b| starts_char(b)).sum::<u64>())
        .sum();
    let in_tail: u64 = tail.iter().map(|&b| starts_char(b)).sum();

    in_blocks + in_tail
}
496
/// Character count for the C locale: every byte is one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let len = data.len();
    len as u64
}
502
503#[inline]
505pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
506 if utf8 {
507 count_chars_utf8(data)
508 } else {
509 count_chars_c(data)
510 }
511}
512
/// Reports whether the process environment selects a UTF-8 locale.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order. The first one
/// that is set to a non-empty (valid Unicode) value decides the answer: `true`
/// when that value contains `utf-8` or `utf8`, compared case-insensitively.
/// Returns `false` when none of them is set to a non-empty value.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
}
525
/// Decodes the UTF-8 sequence at the start of `bytes`.
///
/// Returns `(code_point, encoded_length)` for a well-formed sequence. For
/// anything that is not well-formed the result is `(first_byte, 1)`, so a
/// caller can always advance by the returned length. Rejected input covers:
/// * stray continuation bytes and the lead bytes `0xC0`, `0xC1`, `0xF5..=0xFF`;
/// * truncated sequences or sequences with a non-continuation byte inside;
/// * overlong encodings (a value that fits in fewer bytes);
/// * UTF-16 surrogates `U+D800..=U+DFFF` and values above `U+10FFFF`.
///
/// For a non-ASCII first byte a returned length of `1` therefore always means
/// "undecodable byte".
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    // Smallest scalar value that genuinely needs an encoding of the given length.
    const MIN_FOR_LEN: [u32; 5] = [0, 0, 0x80, 0x800, 0x1_0000];

    let lead = bytes[0];
    let raw_byte = (u32::from(lead), 1);
    if lead < 0x80 {
        return raw_byte;
    }

    // Payload bits of the continuation byte at `k`, if there is one.
    let cont = |k: usize| -> Option<u32> {
        bytes
            .get(k)
            .filter(|&&c| c & 0xC0 == 0x80)
            .map(|&c| u32::from(c & 0x3F))
    };

    let decoded: Option<(u32, usize)> = match lead {
        0xC2..=0xDF => cont(1).map(|a| ((u32::from(lead & 0x1F) << 6) | a, 2)),
        0xE0..=0xEF => match (cont(1), cont(2)) {
            (Some(a), Some(b)) => Some(((u32::from(lead & 0x0F) << 12) | (a << 6) | b, 3)),
            _ => None,
        },
        0xF0..=0xF4 => match (cont(1), cont(2), cont(3)) {
            (Some(a), Some(b), Some(c)) => Some((
                (u32::from(lead & 0x07) << 18) | (a << 12) | (b << 6) | c,
                4,
            )),
            _ => None,
        },
        _ => None,
    };

    match decoded {
        // `char::from_u32` rejects surrogates and values above U+10FFFF; the
        // minimum check rejects overlong forms.
        Some((cp, len)) if cp >= MIN_FOR_LEN[len] && char::from_u32(cp).is_some() => (cp, len),
        _ => raw_byte,
    }
}
569
/// Returns `true` when `cp` falls in one of the code point ranges this file
/// renders as two columns wide (see `max_line_length_utf8`).
///
/// The ranges are kept as a sorted, non-overlapping table of inclusive
/// `(first, last)` pairs and looked up by binary search.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    use std::cmp::Ordering;

    const WIDE_RANGES: [(u32, u32); 78] = [
        (0x1100, 0x115F),
        (0x231A, 0x231B),
        (0x2329, 0x232A),
        (0x23E9, 0x23F3),
        (0x23F8, 0x23FA),
        (0x25FD, 0x25FE),
        (0x2614, 0x2615),
        (0x2648, 0x2653),
        (0x267F, 0x267F),
        (0x2693, 0x2693),
        (0x26A1, 0x26A1),
        (0x26AA, 0x26AB),
        (0x26BD, 0x26BE),
        (0x26C4, 0x26C5),
        (0x26CE, 0x26CE),
        (0x26D4, 0x26D4),
        (0x26EA, 0x26EA),
        (0x26F2, 0x26F3),
        (0x26F5, 0x26F5),
        (0x26FA, 0x26FA),
        (0x26FD, 0x26FD),
        (0x2702, 0x2702),
        (0x2705, 0x2705),
        (0x2708, 0x270D),
        (0x270F, 0x270F),
        (0x2712, 0x2712),
        (0x2714, 0x2714),
        (0x2716, 0x2716),
        (0x271D, 0x271D),
        (0x2721, 0x2721),
        (0x2728, 0x2728),
        (0x2733, 0x2734),
        (0x2744, 0x2744),
        (0x2747, 0x2747),
        (0x274C, 0x274C),
        (0x274E, 0x274E),
        (0x2753, 0x2755),
        (0x2757, 0x2757),
        (0x2763, 0x2764),
        (0x2795, 0x2797),
        (0x27A1, 0x27A1),
        (0x27B0, 0x27B0),
        (0x27BF, 0x27BF),
        (0x2934, 0x2935),
        (0x2B05, 0x2B07),
        (0x2B1B, 0x2B1C),
        (0x2B50, 0x2B50),
        (0x2B55, 0x2B55),
        (0x2E80, 0x303E),
        (0x3041, 0x33BF),
        (0x3400, 0x4DBF),
        (0x4E00, 0xA4CF),
        (0xA960, 0xA97C),
        (0xAC00, 0xD7A3),
        (0xF900, 0xFAFF),
        (0xFE10, 0xFE19),
        (0xFE30, 0xFE6F),
        (0xFF01, 0xFF60),
        (0xFFE0, 0xFFE6),
        (0x1F004, 0x1F004),
        (0x1F0CF, 0x1F0CF),
        (0x1F170, 0x1F171),
        (0x1F17E, 0x1F17F),
        (0x1F18E, 0x1F18E),
        (0x1F191, 0x1F19A),
        (0x1F1E0, 0x1F1FF),
        (0x1F200, 0x1F202),
        (0x1F210, 0x1F23B),
        (0x1F240, 0x1F248),
        (0x1F250, 0x1F251),
        (0x1F260, 0x1F265),
        (0x1F300, 0x1F64F),
        (0x1F680, 0x1F6FF),
        (0x1F900, 0x1F9FF),
        (0x1FA00, 0x1FA6F),
        (0x1FA70, 0x1FAFF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ];

    WIDE_RANGES
        .binary_search_by(|&(first, last)| {
            if last < cp {
                Ordering::Less
            } else if first > cp {
                Ordering::Greater
            } else {
                Ordering::Equal
            }
        })
        .is_ok()
}
655
656pub fn max_line_length_c(data: &[u8]) -> u64 {
666 let mut max_len: u64 = 0;
667 let mut line_len: u64 = 0; let mut linepos: u64 = 0; for &b in data {
671 match b {
672 b'\n' => {
673 if line_len > max_len {
674 max_len = line_len;
675 }
676 linepos = 0;
677 line_len = 0;
678 }
679 b'\t' => {
680 linepos = (linepos + 8) & !7;
681 if linepos > line_len {
682 line_len = linepos;
683 }
684 }
685 b'\r' => {
686 linepos = 0;
687 }
688 0x0C => {
689 if line_len > max_len {
691 max_len = line_len;
692 }
693 linepos = 0;
694 line_len = 0;
695 }
696 _ => {
697 if PRINTABLE_TABLE[b as usize] != 0 {
698 linepos += 1;
699 if linepos > line_len {
700 line_len = linepos;
701 }
702 }
703 }
705 }
706 }
707
708 if line_len > max_len {
710 max_len = line_len;
711 }
712
713 max_len
714}
715
716pub fn max_line_length_utf8(data: &[u8]) -> u64 {
721 let mut max_len: u64 = 0;
722 let mut line_len: u64 = 0;
723 let mut linepos: u64 = 0;
724 let mut i = 0;
725
726 while i < data.len() {
727 let b = data[i];
728
729 if b < 0x80 {
731 match b {
732 b'\n' => {
733 if line_len > max_len {
734 max_len = line_len;
735 }
736 linepos = 0;
737 line_len = 0;
738 }
739 b'\t' => {
740 linepos = (linepos + 8) & !7;
741 if linepos > line_len {
742 line_len = linepos;
743 }
744 }
745 b'\r' => {
746 linepos = 0;
747 }
748 0x0C => {
749 if line_len > max_len {
751 max_len = line_len;
752 }
753 linepos = 0;
754 line_len = 0;
755 }
756 0x20..=0x7E => {
757 linepos += 1;
759 if linepos > line_len {
760 line_len = linepos;
761 }
762 }
763 _ => {
764 }
766 }
767 i += 1;
768 } else {
769 let (cp, len) = decode_utf8(&data[i..]);
771
772 if cp <= 0x9F {
774 } else if is_wide_char(cp) {
776 linepos += 2;
777 if linepos > line_len {
778 line_len = linepos;
779 }
780 } else {
781 linepos += 1;
783 if linepos > line_len {
784 line_len = linepos;
785 }
786 }
787 i += len;
788 }
789 }
790
791 if line_len > max_len {
793 max_len = line_len;
794 }
795
796 max_len
797}
798
799#[inline]
801pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
802 if utf8 {
803 max_line_length_utf8(data)
804 } else {
805 max_line_length_c(data)
806 }
807}
808
809pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
821 if utf8 {
822 let (lines, words) = count_lines_words_utf8_fused(data);
823 WcCounts {
824 lines,
825 words,
826 bytes: data.len() as u64,
827 chars: count_chars_utf8(data),
828 max_line_length: max_line_length_utf8(data),
829 }
830 } else {
831 WcCounts {
832 lines: count_lines(data),
833 words: count_words_locale(data, false),
834 bytes: data.len() as u64,
835 chars: data.len() as u64,
836 max_line_length: max_line_length_c(data),
837 }
838 }
839}
840
841pub fn count_lines_parallel(data: &[u8]) -> u64 {
847 if data.len() < PARALLEL_THRESHOLD {
848 return count_lines(data);
849 }
850
851 let num_threads = rayon::current_num_threads().max(1);
852 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
853
854 data.par_chunks(chunk_size)
855 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
856 .sum()
857}
858
859pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
861 if utf8 || data.len() < PARALLEL_THRESHOLD {
862 return count_words_locale(data, utf8);
865 }
866
867 let num_threads = rayon::current_num_threads().max(1);
869 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
870
871 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
872
873 let results: Vec<(u64, bool, bool)> = chunks
875 .par_iter()
876 .map(|chunk| count_words_c_chunk(chunk))
877 .collect();
878
879 let mut total = 0u64;
880 for i in 0..results.len() {
881 total += results[i].0;
882 if i > 0 && results[i - 1].2 && results[i].1 {
886 total -= 1;
887 }
888 }
889 total
890}
891
892pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
894 if !utf8 {
895 return data.len() as u64;
896 }
897 if data.len() < PARALLEL_THRESHOLD {
898 return count_chars_utf8(data);
899 }
900
901 let num_threads = rayon::current_num_threads().max(1);
902 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
903
904 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
905}
906
907pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
910 let (lines, words) = count_lines_words(data, utf8);
911 (lines, words, data.len() as u64)
912}
913
914pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
917 if data.len() < PARALLEL_THRESHOLD {
918 return count_lwb(data, utf8);
920 }
921
922 let (lines, words) = if utf8 {
925 count_lines_words_utf8_fused(data)
926 } else {
927 let num_threads = rayon::current_num_threads().max(1);
929 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
930
931 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
932 let results: Vec<(u64, bool, bool)> = chunks
933 .par_iter()
934 .map(|chunk| count_words_c_chunk(chunk))
935 .collect();
936
937 let mut word_total = 0u64;
938 for i in 0..results.len() {
939 word_total += results[i].0;
940 if i > 0 && results[i - 1].2 && results[i].1 {
941 word_total -= 1;
942 }
943 }
944
945 let line_total: u64 = data
946 .par_chunks(chunk_size)
947 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
948 .sum();
949
950 (line_total, word_total)
951 };
952
953 (lines, words, data.len() as u64)
954}
955
956pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
958 if data.len() < PARALLEL_THRESHOLD {
959 let lines = count_lines(data);
960 let words = count_words_locale(data, utf8);
961 let chars = count_chars(data, utf8);
962 return (lines, words, chars);
963 }
964
965 let words = count_words_parallel(data, utf8);
967
968 let num_threads = rayon::current_num_threads().max(1);
970 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
971
972 let lines: u64 = data
973 .par_chunks(chunk_size)
974 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
975 .sum();
976
977 let chars = if utf8 {
978 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
979 } else {
980 data.len() as u64
981 };
982
983 (lines, words, chars)
984}