1use memchr::memchr_iter;
2use rayon::prelude::*;
3
/// Input size in bytes (2 MiB) below which the `*_parallel` counters fall back
/// to their serial counterparts instead of splitting the buffer into chunks.
const PARALLEL_THRESHOLD: usize = 2 << 20;
7
/// The full set of counters produced for one input buffer.
///
/// All fields default to zero.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
    /// Number of newline (`\n`) bytes.
    pub lines: u64,
    /// Number of words.
    pub words: u64,
    /// Length of the input in bytes.
    pub bytes: u64,
    /// Number of characters (equal to `bytes` outside UTF-8 mode).
    pub chars: u64,
    /// Display width of the widest line.
    pub max_line_length: u64,
}
17
/// Builds the per-byte classification table used by the C-locale word counters.
///
/// Table values:
/// * `0` — word content (NUL and `0x21..=0x7E`),
/// * `1` — whitespace that ends a word (`0x09..=0x0D` and space),
/// * `2` — every other byte; the counters leave their word state untouched.
const fn make_byte_class_c() -> [u8; 256] {
    const WHITESPACE: [u8; 6] = [b'\t', b'\n', 0x0B, 0x0C, b'\r', b' '];

    let mut table = [2u8; 256];

    // NUL is treated as word content in this table.
    table[0x00] = 0;

    let mut k = 0;
    while k < WHITESPACE.len() {
        table[WHITESPACE[k] as usize] = 1;
        k += 1;
    }

    // `!` through `~`.
    let mut byte = 0x21usize;
    while byte < 0x7F {
        table[byte] = 0;
        byte += 1;
    }

    table
}

/// Compile-time instance of the table produced by [`make_byte_class_c`].
const BYTE_CLASS_C: [u8; 256] = make_byte_class_c();
57
/// Builds the per-byte classification table used by the UTF-8 word counters.
///
/// Table values:
/// * `0` — word content (`0x21..=0x7E`),
/// * `1` — whitespace that ends a word (`0x09..=0x0D` and space),
/// * `2` — every other byte, including NUL; the counters leave their word
///   state untouched.
const fn make_byte_class_utf8() -> [u8; 256] {
    const WHITESPACE: [u8; 6] = [b'\t', b'\n', 0x0B, 0x0C, b'\r', b' '];

    let mut table = [2u8; 256];

    let mut k = 0;
    while k < WHITESPACE.len() {
        table[WHITESPACE[k] as usize] = 1;
        k += 1;
    }

    // `!` through `~`.
    let mut byte = 0x21usize;
    while byte < 0x7F {
        table[byte] = 0;
        byte += 1;
    }

    table
}

/// Compile-time instance of the table produced by [`make_byte_class_utf8`].
const BYTE_CLASS_UTF8: [u8; 256] = make_byte_class_utf8();
79
/// Builds a lookup table holding `1` for the bytes `0x20..=0x7E` (space
/// through `~`) and `0` for every other byte.
const fn make_printable_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut byte = b' ' as usize;
    while byte < 0x7F {
        table[byte] = 1;
        byte += 1;
    }
    table
}

/// Compile-time instance of the table produced by [`make_printable_table`].
const PRINTABLE_TABLE: [u8; 256] = make_printable_table();
92
/// Returns `true` when `cp` is one of the non-ASCII code points this module
/// treats as word-separating whitespace.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        0x2000..=0x200A => true,
        0x00A0 | 0x1680 | 0x2028 | 0x2029 | 0x202F | 0x205F | 0x3000 => true,
        _ => false,
    }
}
114
/// Returns `true` for every code point at or above U+00A0.
///
/// The UTF-8 word counters call this only after `is_unicode_space` has
/// returned `false`, so a `true` result marks the code point as word content.
#[inline]
fn is_unicode_printable(cp: u32) -> bool {
    cp > 0x9F
}
122
123#[inline]
130pub fn count_lines(data: &[u8]) -> u64 {
131 memchr_iter(b'\n', data).count() as u64
132}
133
/// Returns the length of `data` in bytes.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let len = data.len();
    len as u64
}
139
140pub fn count_words(data: &[u8]) -> u64 {
142 count_words_locale(data, true)
143}
144
145pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
152 if utf8 {
153 count_words_utf8(data)
154 } else {
155 count_words_c(data)
156 }
157}
158
159fn count_words_c(data: &[u8]) -> u64 {
163 let mut words = 0u64;
164 let mut in_word = false;
165 for &b in data {
166 let class = BYTE_CLASS_C[b as usize];
167 if class == 1 {
168 in_word = false;
170 } else if class == 0 {
171 if !in_word {
173 in_word = true;
174 words += 1;
175 }
176 }
177 }
179 words
180}
181
182fn count_words_c_chunk(data: &[u8]) -> (u64, bool, bool) {
186 let mut words = 0u64;
187 let mut in_word = false;
188 let mut first_active_is_printable = false;
189 let mut seen_active = false;
190
191 for &b in data {
192 let class = BYTE_CLASS_C[b as usize];
193 if class == 1 {
194 if !seen_active {
195 seen_active = true;
196 }
198 in_word = false;
199 } else if class == 0 {
200 if !seen_active {
201 seen_active = true;
202 first_active_is_printable = true;
203 }
204 if !in_word {
205 in_word = true;
206 words += 1;
207 }
208 }
209 }
210 (words, first_active_is_printable, in_word)
211}
212
213fn count_words_utf8(data: &[u8]) -> u64 {
222 let mut words = 0u64;
223 let mut in_word = false;
224 let mut i = 0;
225
226 while i < data.len() {
227 let b = data[i];
228
229 if b < 0x80 {
230 let class = BYTE_CLASS_UTF8[b as usize];
232 if class == 1 {
233 in_word = false;
234 } else if class == 0 {
235 if !in_word {
236 in_word = true;
237 words += 1;
238 }
239 }
240 i += 1;
242 } else if b < 0xC2 {
243 i += 1;
247 } else if b < 0xE0 {
248 if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
250 let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
251 if is_unicode_space(cp) {
252 in_word = false;
253 } else if is_unicode_printable(cp) {
254 if !in_word {
255 in_word = true;
256 words += 1;
257 }
258 }
259 i += 2;
261 } else {
262 i += 1;
264 }
265 } else if b < 0xF0 {
266 if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
268 let cp = ((b as u32 & 0x0F) << 12)
269 | ((data[i + 1] as u32 & 0x3F) << 6)
270 | (data[i + 2] as u32 & 0x3F);
271 if is_unicode_space(cp) {
272 in_word = false;
273 } else if is_unicode_printable(cp) {
274 if !in_word {
275 in_word = true;
276 words += 1;
277 }
278 }
279 i += 3;
280 } else {
281 i += 1;
283 }
284 } else if b < 0xF5 {
285 if i + 3 < data.len()
287 && (data[i + 1] & 0xC0) == 0x80
288 && (data[i + 2] & 0xC0) == 0x80
289 && (data[i + 3] & 0xC0) == 0x80
290 {
291 let cp = ((b as u32 & 0x07) << 18)
292 | ((data[i + 1] as u32 & 0x3F) << 12)
293 | ((data[i + 2] as u32 & 0x3F) << 6)
294 | (data[i + 3] as u32 & 0x3F);
295 if is_unicode_space(cp) {
296 in_word = false;
297 } else if is_unicode_printable(cp) {
298 if !in_word {
299 in_word = true;
300 words += 1;
301 }
302 }
303 i += 4;
304 } else {
305 i += 1;
307 }
308 } else {
309 i += 1;
311 }
312 }
313
314 words
315}
316
317pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
321 if utf8 {
322 count_lines_words_utf8_fused(data)
323 } else {
324 let mut lines = 0u64;
325 let mut words = 0u64;
326 let mut in_word = false;
327 for &b in data {
328 if b == b'\n' {
329 lines += 1;
330 }
331 let class = BYTE_CLASS_C[b as usize];
332 if class == 1 {
333 in_word = false;
334 } else if class == 0 {
335 if !in_word {
336 in_word = true;
337 words += 1;
338 }
339 }
340 }
341 (lines, words)
342 }
343}
344
345fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
348 let mut lines = 0u64;
349 let mut words = 0u64;
350 let mut in_word = false;
351 let mut i = 0;
352
353 while i < data.len() {
354 let b = data[i];
355
356 if b < 0x80 {
357 if b == b'\n' {
359 lines += 1;
360 in_word = false;
361 } else {
362 let class = BYTE_CLASS_UTF8[b as usize];
363 if class == 1 {
364 in_word = false;
365 } else if class == 0 {
366 if !in_word {
367 in_word = true;
368 words += 1;
369 }
370 }
371 }
372 i += 1;
373 } else if b < 0xC2 {
374 i += 1;
375 } else if b < 0xE0 {
376 if i + 1 < data.len() && (data[i + 1] & 0xC0) == 0x80 {
377 let cp = ((b as u32 & 0x1F) << 6) | (data[i + 1] as u32 & 0x3F);
378 if is_unicode_space(cp) {
379 in_word = false;
380 } else if is_unicode_printable(cp) {
381 if !in_word {
382 in_word = true;
383 words += 1;
384 }
385 }
386 i += 2;
387 } else {
388 i += 1;
389 }
390 } else if b < 0xF0 {
391 if i + 2 < data.len() && (data[i + 1] & 0xC0) == 0x80 && (data[i + 2] & 0xC0) == 0x80 {
392 let cp = ((b as u32 & 0x0F) << 12)
393 | ((data[i + 1] as u32 & 0x3F) << 6)
394 | (data[i + 2] as u32 & 0x3F);
395 if is_unicode_space(cp) {
396 in_word = false;
397 } else if is_unicode_printable(cp) {
398 if !in_word {
399 in_word = true;
400 words += 1;
401 }
402 }
403 i += 3;
404 } else {
405 i += 1;
406 }
407 } else if b < 0xF5 {
408 if i + 3 < data.len()
409 && (data[i + 1] & 0xC0) == 0x80
410 && (data[i + 2] & 0xC0) == 0x80
411 && (data[i + 3] & 0xC0) == 0x80
412 {
413 let cp = ((b as u32 & 0x07) << 18)
414 | ((data[i + 1] as u32 & 0x3F) << 12)
415 | ((data[i + 2] as u32 & 0x3F) << 6)
416 | (data[i + 3] as u32 & 0x3F);
417 if is_unicode_space(cp) {
418 in_word = false;
419 } else if is_unicode_printable(cp) {
420 if !in_word {
421 in_word = true;
422 words += 1;
423 }
424 }
425 i += 4;
426 } else {
427 i += 1;
428 }
429 } else {
430 i += 1;
431 }
432 }
433
434 (lines, words)
435}
436
437pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
439 if utf8 {
440 let (lines, words) = count_lines_words_utf8_fused(data);
442 let chars = count_chars_utf8(data);
443 (lines, words, chars)
444 } else {
445 let mut lines = 0u64;
447 let mut words = 0u64;
448 let mut in_word = false;
449 for &b in data {
450 if b == b'\n' {
451 lines += 1;
452 }
453 let class = BYTE_CLASS_C[b as usize];
454 if class == 1 {
455 in_word = false;
456 } else if class == 0 {
457 if !in_word {
458 in_word = true;
459 words += 1;
460 }
461 }
462 }
463 (lines, words, data.len() as u64)
464 }
465}
466
/// Counts UTF-8 characters by counting every byte that is not a continuation
/// byte (`10xxxxxx`).
///
/// The input is processed in 64-byte blocks: each block is turned into a
/// 64-bit mask with one bit per non-continuation byte, and the set bits are
/// counted. The bytes left over after the last full block are counted one
/// at a time.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    // 1 when `byte` is not a continuation byte, 0 otherwise.
    let starts_char = |byte: u8| u64::from((byte & 0xC0) != 0x80);

    let mut blocks = data.chunks_exact(64);
    let mut total = 0u64;

    for block in blocks.by_ref() {
        let mut mask = 0u64;
        for (bit, &byte) in block.iter().enumerate() {
            mask |= starts_char(byte) << bit;
        }
        total += u64::from(mask.count_ones());
    }

    let tail: u64 = blocks.remainder().iter().map(|&byte| starts_char(byte)).sum();
    total + tail
}
500
/// Counts characters in the C locale, where every byte is one character.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_len = data.len();
    byte_len as u64
}
506
507#[inline]
509pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
510 if utf8 {
511 count_chars_utf8(data)
512 } else {
513 count_chars_c(data)
514 }
515}
516
/// Reports whether the environment selects a UTF-8 locale.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order. The first one
/// that is set to a non-empty value decides the result: `true` when that
/// value contains `utf-8` or `utf8` (case-insensitively), `false` otherwise.
/// Returns `false` when none of them holds a non-empty value.
pub fn is_utf8_locale() -> bool {
    ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map_or(false, |value| {
            let folded = value.to_ascii_lowercase();
            folded.contains("utf-8") || folded.contains("utf8")
        })
}
529
/// Decodes the sequence at the start of `bytes`, returning
/// `(code point, bytes consumed)`.
///
/// A lead byte in `0xC2..=0xF4` followed by the required number of
/// continuation bytes yields the decoded code point and its length (2–4).
/// Everything else — ASCII, stray continuation bytes, invalid lead bytes,
/// truncated or broken sequences — yields the first byte's value and a
/// length of 1.
///
/// # Panics
///
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let lead = bytes[0];
    let head = u32::from(lead);

    // Payload bits of the byte at `idx`, if it exists and is a continuation.
    let tail = |idx: usize| -> Option<u32> {
        bytes
            .get(idx)
            .filter(|&&b| b & 0xC0 == 0x80)
            .map(|&b| u32::from(b) & 0x3F)
    };

    let decoded = match lead {
        0xC2..=0xDF => tail(1).map(|t1| (((head & 0x1F) << 6) | t1, 2)),
        0xE0..=0xEF => match (tail(1), tail(2)) {
            (Some(t1), Some(t2)) => Some((((head & 0x0F) << 12) | (t1 << 6) | t2, 3)),
            _ => None,
        },
        0xF0..=0xF4 => match (tail(1), tail(2), tail(3)) {
            (Some(t1), Some(t2), Some(t3)) => {
                Some((((head & 0x07) << 18) | (t1 << 12) | (t2 << 6) | t3, 4))
            }
            _ => None,
        },
        _ => None,
    };

    decoded.unwrap_or((head, 1))
}
573
/// Returns `true` when `cp` is in this module's table of code points that
/// occupy no display column.
///
/// `max_line_length_utf8` adds nothing to the line width for these.
/// The arms are grouped by numeric range only; each lists its entries in
/// ascending order.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    match cp {
        // U+0300 ..= U+08FF
        0x0300..=0x036F | 0x0483..=0x0489 | 0x0591..=0x05BD | 0x05BF
        | 0x05C1..=0x05C2 | 0x05C4..=0x05C5 | 0x05C7
        | 0x0600..=0x0605 | 0x0610..=0x061A | 0x064B..=0x065F | 0x0670
        | 0x06D6..=0x06DD | 0x06DF..=0x06E4 | 0x06E7..=0x06E8 | 0x06EA..=0x06ED
        | 0x070F | 0x0711 | 0x0730..=0x074A
        | 0x07A6..=0x07B0 | 0x07EB..=0x07F3 | 0x07FD
        | 0x0816..=0x0819 | 0x081B..=0x0823 | 0x0825..=0x0827 | 0x0829..=0x082D
        | 0x0859..=0x085B | 0x08D3..=0x08E1 | 0x08E3..=0x0902 => true,

        // U+0900 ..= U+0BFF
        0x093A | 0x093C | 0x0941..=0x0948 | 0x094D | 0x0951..=0x0957 | 0x0962..=0x0963
        | 0x0981 | 0x09BC | 0x09C1..=0x09C4 | 0x09CD | 0x09E2..=0x09E3 | 0x09FE
        | 0x0A01..=0x0A02 | 0x0A3C | 0x0A41..=0x0A42 | 0x0A47..=0x0A48 | 0x0A4B..=0x0A4D
        | 0x0A51 | 0x0A70..=0x0A71 | 0x0A75
        | 0x0A81..=0x0A82 | 0x0ABC | 0x0AC1..=0x0AC5 | 0x0AC7..=0x0AC8 | 0x0ACD
        | 0x0AE2..=0x0AE3 | 0x0AFA..=0x0AFF
        | 0x0B01 | 0x0B3C | 0x0B3F | 0x0B41..=0x0B44 | 0x0B4D | 0x0B56 | 0x0B62..=0x0B63
        | 0x0B82 | 0x0BC0 | 0x0BCD => true,

        // U+0C00 ..= U+0FFF
        0x0C00 | 0x0C04 | 0x0C3E..=0x0C40 | 0x0C46..=0x0C48 | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56 | 0x0C62..=0x0C63
        | 0x0C81 | 0x0CBC | 0x0CBF | 0x0CC6 | 0x0CCC..=0x0CCD | 0x0CE2..=0x0CE3
        | 0x0D00..=0x0D01 | 0x0D3B..=0x0D3C | 0x0D41..=0x0D44 | 0x0D4D | 0x0D62..=0x0D63
        | 0x0DCA | 0x0DD2..=0x0DD4 | 0x0DD6
        | 0x0E31 | 0x0E34..=0x0E3A | 0x0E47..=0x0E4E
        | 0x0EB1 | 0x0EB4..=0x0EBC | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19 | 0x0F35 | 0x0F37 | 0x0F39
        | 0x0F71..=0x0F7E | 0x0F80..=0x0F84 | 0x0F86..=0x0F87
        | 0x0F8D..=0x0F97 | 0x0F99..=0x0FBC | 0x0FC6 => true,

        // U+1000 ..= U+18FF
        0x102D..=0x1030 | 0x1032..=0x1037 | 0x1039..=0x103A | 0x103D..=0x103E
        | 0x1058..=0x1059 | 0x105E..=0x1060 | 0x1071..=0x1074
        | 0x1082 | 0x1085..=0x1086 | 0x108D | 0x109D
        | 0x1160..=0x11FF | 0x135D..=0x135F
        | 0x1712..=0x1714 | 0x1732..=0x1734 | 0x1752..=0x1753 | 0x1772..=0x1773
        | 0x17B4..=0x17B5 | 0x17B7..=0x17BD | 0x17C6 | 0x17C9..=0x17D3 | 0x17DD
        | 0x180B..=0x180D | 0x1885..=0x1886 | 0x18A9 => true,

        // U+1900 ..= U+1DFF
        0x1920..=0x1922 | 0x1927..=0x1928 | 0x1932 | 0x1939..=0x193B
        | 0x1A17..=0x1A18 | 0x1A1B | 0x1A56 | 0x1A58..=0x1A5E | 0x1A60 | 0x1A62
        | 0x1A65..=0x1A6C | 0x1A73..=0x1A7C | 0x1A7F | 0x1AB0..=0x1ABE
        | 0x1B00..=0x1B03 | 0x1B34 | 0x1B36..=0x1B3A | 0x1B3C | 0x1B42 | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81 | 0x1BA2..=0x1BA5 | 0x1BA8..=0x1BA9 | 0x1BAB..=0x1BAD
        | 0x1BE6 | 0x1BE8..=0x1BE9 | 0x1BED | 0x1BEF..=0x1BF1
        | 0x1C2C..=0x1C33 | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2 | 0x1CD4..=0x1CE0 | 0x1CE2..=0x1CE8 | 0x1CED | 0x1CF4
        | 0x1CF8..=0x1CF9 | 0x1DC0..=0x1DF9 | 0x1DFB..=0x1DFF => true,

        // U+2000 ..= U+FFFF
        0x200B..=0x200F | 0x202A..=0x202E | 0x2060..=0x2064 | 0x2066..=0x206F
        | 0x20D0..=0x20F0
        | 0xFE00..=0xFE0F | 0xFE20..=0xFE2F | 0xFEFF | 0xFFF9..=0xFFFB => true,

        // Above U+FFFF
        0x1D167..=0x1D169 | 0x1D173..=0x1D182 | 0x1D185..=0x1D18B | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001 | 0xE0020..=0xE007F | 0xE0100..=0xE01EF => true,

        _ => false,
    }
}
764
/// Returns `true` when `cp` is in this module's table of code points that
/// occupy two display columns.
///
/// `max_line_length_utf8` adds 2 to the line width for these.
/// The arms are grouped by numeric range only; each lists its entries in
/// ascending order.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    match cp {
        // U+1100 ..= U+25FF
        0x1100..=0x115F | 0x231A..=0x231B | 0x2329..=0x232A | 0x23E9..=0x23F3
        | 0x23F8..=0x23FA | 0x25FD..=0x25FE => true,

        // U+2600 ..= U+26FF
        0x2614..=0x2615 | 0x2648..=0x2653 | 0x267F | 0x2693 | 0x26A1
        | 0x26AA..=0x26AB | 0x26BD..=0x26BE | 0x26C4..=0x26C5 | 0x26CE | 0x26D4
        | 0x26EA | 0x26F2..=0x26F3 | 0x26F5 | 0x26FA | 0x26FD => true,

        // U+2700 ..= U+2BFF
        0x2702 | 0x2705 | 0x2708..=0x270D | 0x270F | 0x2712 | 0x2714 | 0x2716
        | 0x271D | 0x2721 | 0x2728 | 0x2733..=0x2734 | 0x2744 | 0x2747
        | 0x274C | 0x274E | 0x2753..=0x2755 | 0x2757 | 0x2763..=0x2764
        | 0x2795..=0x2797 | 0x27A1 | 0x27B0 | 0x27BF
        | 0x2934..=0x2935 | 0x2B05..=0x2B07 | 0x2B1B..=0x2B1C | 0x2B50 | 0x2B55 => true,

        // U+2E80 ..= U+FFFF
        0x2E80..=0x303E | 0x3040..=0x33BF | 0x3400..=0x4DBF | 0x4E00..=0xA4CF
        | 0xA960..=0xA97C | 0xAC00..=0xD7A3 | 0xF900..=0xFAFF
        | 0xFE10..=0xFE19 | 0xFE30..=0xFE6F | 0xFF01..=0xFF60 | 0xFFE0..=0xFFE6 => true,

        // U+1F000 ..= U+1FAFF
        0x1F004 | 0x1F0CF | 0x1F170..=0x1F171 | 0x1F17E..=0x1F17F | 0x1F18E
        | 0x1F191..=0x1F19A | 0x1F1E0..=0x1F1FF | 0x1F200..=0x1F202
        | 0x1F210..=0x1F23B | 0x1F240..=0x1F248 | 0x1F250..=0x1F251 | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F | 0x1F680..=0x1F6FF | 0x1F900..=0x1F9FF
        | 0x1FA00..=0x1FA6F | 0x1FA70..=0x1FAFF => true,

        // U+20000 and above
        0x20000..=0x2FFFD | 0x30000..=0x3FFFD => true,

        _ => false,
    }
}
851
852pub fn max_line_length_c(data: &[u8]) -> u64 {
862 let mut max_len: u64 = 0;
863 let mut line_len: u64 = 0; let mut linepos: u64 = 0; for &b in data {
867 match b {
868 b'\n' => {
869 if line_len > max_len {
870 max_len = line_len;
871 }
872 linepos = 0;
873 line_len = 0;
874 }
875 b'\t' => {
876 linepos = (linepos + 8) & !7;
877 if linepos > line_len {
878 line_len = linepos;
879 }
880 }
881 b'\r' => {
882 linepos = 0;
883 }
884 0x0C => {
885 if line_len > max_len {
887 max_len = line_len;
888 }
889 linepos = 0;
890 line_len = 0;
891 }
892 _ => {
893 if PRINTABLE_TABLE[b as usize] != 0 {
894 linepos += 1;
895 if linepos > line_len {
896 line_len = linepos;
897 }
898 }
899 }
901 }
902 }
903
904 if line_len > max_len {
906 max_len = line_len;
907 }
908
909 max_len
910}
911
912pub fn max_line_length_utf8(data: &[u8]) -> u64 {
917 let mut max_len: u64 = 0;
918 let mut line_len: u64 = 0;
919 let mut linepos: u64 = 0;
920 let mut i = 0;
921
922 while i < data.len() {
923 let b = data[i];
924
925 if b < 0x80 {
927 match b {
928 b'\n' => {
929 if line_len > max_len {
930 max_len = line_len;
931 }
932 linepos = 0;
933 line_len = 0;
934 }
935 b'\t' => {
936 linepos = (linepos + 8) & !7;
937 if linepos > line_len {
938 line_len = linepos;
939 }
940 }
941 b'\r' => {
942 linepos = 0;
943 }
944 0x0C => {
945 if line_len > max_len {
947 max_len = line_len;
948 }
949 linepos = 0;
950 line_len = 0;
951 }
952 0x20..=0x7E => {
953 linepos += 1;
955 if linepos > line_len {
956 line_len = linepos;
957 }
958 }
959 _ => {
960 }
962 }
963 i += 1;
964 } else {
965 let (cp, len) = decode_utf8(&data[i..]);
967
968 if cp <= 0x9F {
970 } else if is_zero_width(cp) {
972 } else if is_wide_char(cp) {
974 linepos += 2;
975 if linepos > line_len {
976 line_len = linepos;
977 }
978 } else {
979 linepos += 1;
981 if linepos > line_len {
982 line_len = linepos;
983 }
984 }
985 i += len;
986 }
987 }
988
989 if line_len > max_len {
991 max_len = line_len;
992 }
993
994 max_len
995}
996
997#[inline]
999pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
1000 if utf8 {
1001 max_line_length_utf8(data)
1002 } else {
1003 max_line_length_c(data)
1004 }
1005}
1006
1007pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
1019 if utf8 {
1020 let (lines, words) = count_lines_words_utf8_fused(data);
1021 WcCounts {
1022 lines,
1023 words,
1024 bytes: data.len() as u64,
1025 chars: count_chars_utf8(data),
1026 max_line_length: max_line_length_utf8(data),
1027 }
1028 } else {
1029 WcCounts {
1030 lines: count_lines(data),
1031 words: count_words_locale(data, false),
1032 bytes: data.len() as u64,
1033 chars: data.len() as u64,
1034 max_line_length: max_line_length_c(data),
1035 }
1036 }
1037}
1038
1039pub fn count_lines_parallel(data: &[u8]) -> u64 {
1045 if data.len() < PARALLEL_THRESHOLD {
1046 return count_lines(data);
1047 }
1048
1049 let num_threads = rayon::current_num_threads().max(1);
1050 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1051
1052 data.par_chunks(chunk_size)
1053 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1054 .sum()
1055}
1056
1057pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
1059 if utf8 || data.len() < PARALLEL_THRESHOLD {
1060 return count_words_locale(data, utf8);
1063 }
1064
1065 let num_threads = rayon::current_num_threads().max(1);
1067 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1068
1069 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1070
1071 let results: Vec<(u64, bool, bool)> = chunks
1073 .par_iter()
1074 .map(|chunk| count_words_c_chunk(chunk))
1075 .collect();
1076
1077 let mut total = 0u64;
1078 for i in 0..results.len() {
1079 total += results[i].0;
1080 if i > 0 && results[i - 1].2 && results[i].1 {
1084 total -= 1;
1085 }
1086 }
1087 total
1088}
1089
1090pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
1092 if !utf8 {
1093 return data.len() as u64;
1094 }
1095 if data.len() < PARALLEL_THRESHOLD {
1096 return count_chars_utf8(data);
1097 }
1098
1099 let num_threads = rayon::current_num_threads().max(1);
1100 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1101
1102 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1103}
1104
1105pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1108 let (lines, words) = count_lines_words(data, utf8);
1109 (lines, words, data.len() as u64)
1110}
1111
1112pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1115 if data.len() < PARALLEL_THRESHOLD {
1116 return count_lwb(data, utf8);
1118 }
1119
1120 let (lines, words) = if utf8 {
1123 count_lines_words_utf8_fused(data)
1124 } else {
1125 let num_threads = rayon::current_num_threads().max(1);
1127 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1128
1129 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
1130 let results: Vec<(u64, bool, bool)> = chunks
1131 .par_iter()
1132 .map(|chunk| count_words_c_chunk(chunk))
1133 .collect();
1134
1135 let mut word_total = 0u64;
1136 for i in 0..results.len() {
1137 word_total += results[i].0;
1138 if i > 0 && results[i - 1].2 && results[i].1 {
1139 word_total -= 1;
1140 }
1141 }
1142
1143 let line_total: u64 = data
1144 .par_chunks(chunk_size)
1145 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1146 .sum();
1147
1148 (line_total, word_total)
1149 };
1150
1151 (lines, words, data.len() as u64)
1152}
1153
1154pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
1156 if data.len() < PARALLEL_THRESHOLD {
1157 let lines = count_lines(data);
1158 let words = count_words_locale(data, utf8);
1159 let chars = count_chars(data, utf8);
1160 return (lines, words, chars);
1161 }
1162
1163 let words = count_words_parallel(data, utf8);
1165
1166 let num_threads = rayon::current_num_threads().max(1);
1168 let chunk_size = (data.len() / num_threads).max(1024 * 1024);
1169
1170 let lines: u64 = data
1171 .par_chunks(chunk_size)
1172 .map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
1173 .sum();
1174
1175 let chars = if utf8 {
1176 data.par_chunks(chunk_size).map(count_chars_utf8).sum()
1177 } else {
1178 data.len() as u64
1179 };
1180
1181 (lines, words, chars)
1182}