use memchr::memchr_iter;
use rayon::prelude::*;
/// Size cut-off in bytes (1 MiB): the `*_parallel` functions in this file fall back to
/// their sequential counterparts when the input is shorter than this.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
/// Aggregate counters for one input buffer, as filled in by `count_all` and
/// `count_all_parallel`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WcCounts {
/// Number of `\n` bytes in the input.
pub lines: u64,
/// Number of words, using the locale-dependent word rules of this module.
pub words: u64,
/// Length of the input in bytes.
pub bytes: u64,
/// Number of characters: non-continuation bytes in UTF-8 mode, otherwise equal to `bytes`.
pub chars: u64,
/// Largest line width, as computed by `max_line_length_utf8` / `max_line_length_c`.
pub max_line_length: u64,
}
/// Builds the 256-entry lookup table behind [`IS_SPACE`].
///
/// An entry is `true` for the six ASCII whitespace bytes: TAB, LF, VT, FF, CR
/// (the contiguous range `0x09..=0x0D`) and SPACE (`0x20`).
const fn make_is_space() -> [bool; 256] {
    let mut table = [false; 256];
    let mut byte = 0x09usize;
    while byte <= 0x0D {
        table[byte] = true;
        byte += 1;
    }
    table[0x20] = true;
    table
}

/// Byte-indexed table: `IS_SPACE[b as usize]` is `true` when `b` is ASCII whitespace.
const IS_SPACE: [bool; 256] = make_is_space();
/// Builds the 256-entry lookup table behind [`IS_PRINT`].
///
/// An entry is `true` for every byte in `0x20..=0x7E`. Note that this range includes the
/// space byte `0x20`; callers in this file consult `IS_SPACE` first.
const fn make_is_print() -> [bool; 256] {
    let mut table = [false; 256];
    let mut byte = 0x20usize;
    while byte < 0x7F {
        table[byte] = true;
        byte += 1;
    }
    table
}

/// Byte-indexed table: `IS_PRINT[b as usize]` is `true` for bytes `0x20..=0x7E`.
const IS_PRINT: [bool; 256] = make_is_print();
/// Reports whether `data` begins with a word byte, i.e. a byte that is in `IS_PRINT`
/// and not in `IS_SPACE`.
///
/// Only the very first byte is inspected; an empty slice yields `false`.
#[inline]
pub(crate) fn first_is_word(data: &[u8]) -> bool {
    match data.first() {
        Some(&lead) => {
            let idx = usize::from(lead);
            IS_PRINT[idx] && !IS_SPACE[idx]
        }
        None => false,
    }
}
/// Returns `true` for the non-ASCII code points this module treats as whitespace:
/// U+1680, U+2000..=U+200A, U+2028, U+2029, U+205F and U+3000.
#[inline]
fn is_unicode_space(cp: u32) -> bool {
    match cp {
        0x1680 | 0x2028 | 0x2029 | 0x205F | 0x3000 => true,
        _ => (0x2000..=0x200A).contains(&cp),
    }
}
/// Returns `true` for the four code points listed here as non-breaking / joining
/// spaces: U+00A0, U+2007, U+202F and U+2060.
#[inline]
fn is_wnbspace(cp: u32) -> bool {
    const NON_BREAKING: [u32; 4] = [0x00A0, 0x2007, 0x202F, 0x2060];
    NON_BREAKING.contains(&cp)
}
/// A non-ASCII code point ends the current word when it is either a non-breaking
/// space (`is_wnbspace`) or a Unicode space (`is_unicode_space`).
#[inline]
fn is_unicode_word_break(cp: u32) -> bool {
    is_wnbspace(cp) || is_unicode_space(cp)
}
/// Decides whether a decoded non-ASCII code point counts as a word character.
///
/// Rejected are: everything below U+00A0, the surrogate range U+D800..=U+DFFF,
/// values above U+10FFFF, the block U+FDD0..=U+FDEF, and any code point whose low
/// 16 bits are `0xFFFE` or `0xFFFF`.
#[inline]
fn is_printable_unicode(cp: u32) -> bool {
    let below_latin1_printables = cp < 0xA0;
    let surrogate = (0xD800..=0xDFFF).contains(&cp);
    let beyond_unicode = cp > 0x10FFFF;
    let noncharacter = (0xFDD0..=0xFDEF).contains(&cp) || (cp & 0xFFFE) == 0xFFFE;
    !(below_latin1_printables || surrogate || beyond_unicode || noncharacter)
}
/// Counts the `\n` bytes in `data` using `memchr_iter`.
#[inline]
pub fn count_lines(data: &[u8]) -> u64 {
    let newline_hits = memchr_iter(b'\n', data).count();
    newline_hits as u64
}
/// Returns the length of `data` in bytes.
#[inline]
pub fn count_bytes(data: &[u8]) -> u64 {
    let byte_len = data.len();
    byte_len as u64
}
/// Counts words in `data` using the UTF-8 rules; shorthand for
/// `count_words_locale(data, true)`.
pub fn count_words(data: &[u8]) -> u64 {
    let assume_utf8 = true;
    count_words_locale(data, assume_utf8)
}
/// Counts words in `data`, decoding it as UTF-8 when `utf8` is `true` and
/// byte-by-byte (C locale rules) otherwise.
pub fn count_words_locale(data: &[u8], utf8: bool) -> u64 {
    if !utf8 {
        return count_words_c(data);
    }
    count_words_utf8(data)
}
/// Byte-wise (C locale) word counter.
///
/// A byte in `IS_SPACE` ends the current word; a byte in `IS_PRINT` starts a new word
/// when none is open. Every other byte leaves the in-word state untouched.
fn count_words_c(data: &[u8]) -> u64 {
    let mut total = 0u64;
    let mut inside_word = false;
    for &byte in data {
        let idx = usize::from(byte);
        if IS_SPACE[idx] {
            inside_word = false;
        } else if IS_PRINT[idx] && !inside_word {
            inside_word = true;
            total += 1;
        }
    }
    total
}
/// Scalar epilogue shared by the SIMD line/word counters.
///
/// Continues the byte-wise scan over offsets `i..len` of the buffer behind `ptr`,
/// starting from the running totals and in-word state passed in, and returns
/// `(lines, words, first_is_word(data), ends_in_word)`.
///
/// The callers in this file pass `data.as_ptr()` and `data.len()`, so every offset
/// read here lies inside `data`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn count_lw_c_scalar_tail(
    ptr: *const u8,
    i: usize,
    len: usize,
    mut total_lines: u64,
    mut total_words: u64,
    mut prev_in_word: bool,
    data: &[u8],
) -> (u64, u64, bool, bool) {
    for offset in i..len {
        // SAFETY: `offset < len`, and the callers pass the pointer/length of `data`.
        let byte = unsafe { *ptr.add(offset) };
        let idx = usize::from(byte);
        if IS_SPACE[idx] {
            prev_in_word = false;
            if byte == b'\n' {
                total_lines += 1;
            }
        } else if !prev_in_word && IS_PRINT[idx] {
            prev_in_word = true;
            total_words += 1;
        }
    }
    (total_lines, total_words, first_is_word(data), prev_in_word)
}
/// AVX2 implementation of the byte-wise (C locale) line + word counter.
///
/// Consumes `data` 32 bytes at a time and hands whatever is left (fewer than 32 bytes)
/// to `count_lw_c_scalar_tail`, whose tuple it returns:
/// `(lines, words, first_is_word(data), ends_in_word)`.
///
/// # Safety
/// The function is compiled with `#[target_feature(enable = "avx2")]`; the caller must
/// ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_lw_c_chunk_avx2(data: &[u8]) -> (u64, u64, bool, bool) {
use std::arch::x86_64::*;
let len = data.len();
let ptr = data.as_ptr();
let mut i = 0usize;
let mut total_lines = 0u64;
let mut total_words = 0u64;
let mut prev_in_word = false;
unsafe {
let nl_byte = _mm256_set1_epi8(b'\n' as i8);
let zero = _mm256_setzero_si256();
let ones = _mm256_set1_epi8(1);
let const_0x09 = _mm256_set1_epi8(0x09u8 as i8);
let const_0x0d = _mm256_set1_epi8(0x0Du8 as i8);
let const_0x20 = _mm256_set1_epi8(0x20u8 as i8);
let const_0x21 = _mm256_set1_epi8(0x21u8 as i8);
let const_0x7e = _mm256_set1_epi8(0x7Eu8 as i8);
// Per-lane u8 newline counters; `batch` counts iterations since the last flush.
let mut line_acc = _mm256_setzero_si256();
let mut batch = 0u32;
while i + 32 <= len {
// The loop condition guarantees the 32-byte unaligned load stays inside `data`.
let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
// Add 1 to every lane that holds '\n'.
let is_nl = _mm256_cmpeq_epi8(v, nl_byte);
line_acc = _mm256_add_epi8(line_acc, _mm256_and_si256(is_nl, ones));
// Unsigned range tests via max/min: `max(v, c) == v` means v >= c, `min(v, c) == v` means v <= c.
let ge_09 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x09), v);
let le_0d = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x0d), v);
let in_tab_range = _mm256_and_si256(ge_09, le_0d);
let is_sp = _mm256_cmpeq_epi8(v, const_0x20);
// Whitespace = 0x09..=0x0D or 0x20 (same set as IS_SPACE).
let is_space = _mm256_or_si256(in_tab_range, is_sp);
let space_mask = _mm256_movemask_epi8(is_space) as u32;
// Word bytes = 0x21..=0x7E.
let ge_21 = _mm256_cmpeq_epi8(_mm256_max_epu8(v, const_0x21), v);
let le_7e = _mm256_cmpeq_epi8(_mm256_min_epu8(v, const_0x7e), v);
let is_print = _mm256_and_si256(ge_21, le_7e);
let print_mask = _mm256_movemask_epi8(is_print) as u32;
// Bytes that are neither whitespace nor word bytes do not change the in-word state.
let transparent_mask = !(space_mask | print_mask);
if transparent_mask == 0 {
// Every byte is space or word byte, so a word starts at each word byte whose
// predecessor is a space; bit 0 takes its predecessor from the previous block.
let prev_space = (space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 };
let starts = print_mask & prev_space;
total_words += starts.count_ones() as u64;
// The last byte of the block decides the carried in-word state.
prev_in_word = (print_mask >> 31) & 1 == 1;
} else {
// Block contains state-preserving bytes: redo the word scan for it byte by byte
// (newlines were already added to `line_acc` above).
let end = (i + 32).min(len);
for j in i..end {
let b = *ptr.add(j);
if IS_SPACE[b as usize] {
prev_in_word = false;
} else if IS_PRINT[b as usize] && !prev_in_word {
total_words += 1;
prev_in_word = true;
}
}
}
batch += 1;
// Each lane grows by at most 1 per iteration, so flush after 255 iterations,
// before a u8 lane could overflow.
if batch >= 255 {
// Horizontal sum: SAD against zero yields four 64-bit partial sums, then fold them.
let sad = _mm256_sad_epu8(line_acc, zero);
let hi = _mm256_extracti128_si256(sad, 1);
let lo = _mm256_castsi256_si128(sad);
let s = _mm_add_epi64(lo, hi);
let h64 = _mm_unpackhi_epi64(s, s);
let t = _mm_add_epi64(s, h64);
total_lines += _mm_cvtsi128_si64(t) as u64;
line_acc = _mm256_setzero_si256();
batch = 0;
}
i += 32;
}
// Flush the newline counters left over from the last partial batch.
if batch > 0 {
let sad = _mm256_sad_epu8(line_acc, zero);
let hi = _mm256_extracti128_si256(sad, 1);
let lo = _mm256_castsi256_si128(sad);
let s = _mm_add_epi64(lo, hi);
let h64 = _mm_unpackhi_epi64(s, s);
let t = _mm_add_epi64(s, h64);
total_lines += _mm_cvtsi128_si64(t) as u64;
}
}
// Finish the remaining bytes with the scalar loop.
count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
}
/// SSE2 implementation of the byte-wise (C locale) line + word counter.
///
/// Same algorithm as the AVX2 variant but with 16-byte vectors: consumes `data`
/// 16 bytes at a time and passes the remainder to `count_lw_c_scalar_tail`, whose tuple
/// `(lines, words, first_is_word(data), ends_in_word)` it returns.
///
/// # Safety
/// The function is compiled with `#[target_feature(enable = "sse2")]`; the caller must
/// ensure the running CPU supports SSE2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn count_lw_c_chunk_sse2(data: &[u8]) -> (u64, u64, bool, bool) {
use std::arch::x86_64::*;
let len = data.len();
let ptr = data.as_ptr();
let mut i = 0usize;
let mut total_lines = 0u64;
let mut total_words = 0u64;
let mut prev_in_word = false;
unsafe {
let nl_byte = _mm_set1_epi8(b'\n' as i8);
let zero = _mm_setzero_si128();
let ones = _mm_set1_epi8(1);
let const_0x09 = _mm_set1_epi8(0x09u8 as i8);
let const_0x0d = _mm_set1_epi8(0x0Du8 as i8);
let const_0x20 = _mm_set1_epi8(0x20u8 as i8);
let const_0x21 = _mm_set1_epi8(0x21u8 as i8);
let const_0x7e = _mm_set1_epi8(0x7Eu8 as i8);
// Per-lane u8 newline counters; `batch` counts iterations since the last flush.
let mut line_acc = _mm_setzero_si128();
let mut batch = 0u32;
while i + 16 <= len {
// The loop condition guarantees the 16-byte unaligned load stays inside `data`.
let v = _mm_loadu_si128(ptr.add(i) as *const __m128i);
// Add 1 to every lane that holds '\n'.
let is_nl = _mm_cmpeq_epi8(v, nl_byte);
line_acc = _mm_add_epi8(line_acc, _mm_and_si128(is_nl, ones));
// Unsigned range tests via max/min: `max(v, c) == v` means v >= c, `min(v, c) == v` means v <= c.
let ge_09 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x09), v);
let le_0d = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x0d), v);
let in_tab_range = _mm_and_si128(ge_09, le_0d);
let is_sp = _mm_cmpeq_epi8(v, const_0x20);
// Whitespace = 0x09..=0x0D or 0x20 (same set as IS_SPACE).
let is_space = _mm_or_si128(in_tab_range, is_sp);
// Only the low 16 bits of the movemask are meaningful for a 128-bit vector.
let space_mask = (_mm_movemask_epi8(is_space) as u32) & 0xFFFF;
// Word bytes = 0x21..=0x7E.
let ge_21 = _mm_cmpeq_epi8(_mm_max_epu8(v, const_0x21), v);
let le_7e = _mm_cmpeq_epi8(_mm_min_epu8(v, const_0x7e), v);
let is_print = _mm_and_si128(ge_21, le_7e);
let print_mask = (_mm_movemask_epi8(is_print) as u32) & 0xFFFF;
// Bytes that are neither whitespace nor word bytes do not change the in-word state.
let transparent_mask = !(space_mask | print_mask) & 0xFFFF;
if transparent_mask == 0 {
// Every byte is space or word byte, so a word starts at each word byte whose
// predecessor is a space; bit 0 takes its predecessor from the previous block.
let prev_space =
((space_mask << 1) | if prev_in_word { 0u32 } else { 1u32 }) & 0xFFFF;
let starts = print_mask & prev_space;
total_words += starts.count_ones() as u64;
// The last byte of the block decides the carried in-word state.
prev_in_word = (print_mask >> 15) & 1 == 1;
} else {
// Block contains state-preserving bytes: redo the word scan for it byte by byte
// (newlines were already added to `line_acc` above).
let end = (i + 16).min(len);
for j in i..end {
let b = *ptr.add(j);
if IS_SPACE[b as usize] {
prev_in_word = false;
} else if IS_PRINT[b as usize] && !prev_in_word {
total_words += 1;
prev_in_word = true;
}
}
}
batch += 1;
// Each lane grows by at most 1 per iteration, so flush after 255 iterations,
// before a u8 lane could overflow.
if batch >= 255 {
// Horizontal sum: SAD against zero yields two 64-bit partial sums, then fold them.
let sad = _mm_sad_epu8(line_acc, zero);
let hi = _mm_unpackhi_epi64(sad, sad);
let t = _mm_add_epi64(sad, hi);
total_lines += _mm_cvtsi128_si64(t) as u64;
line_acc = _mm_setzero_si128();
batch = 0;
}
i += 16;
}
// Flush the newline counters left over from the last partial batch.
if batch > 0 {
let sad = _mm_sad_epu8(line_acc, zero);
let hi = _mm_unpackhi_epi64(sad, sad);
let t = _mm_add_epi64(sad, hi);
total_lines += _mm_cvtsi128_si64(t) as u64;
}
}
// Finish the remaining bytes with the scalar loop.
count_lw_c_scalar_tail(ptr, i, len, total_lines, total_words, prev_in_word, data)
}
/// Picks the fastest available byte-wise line/word counter for `data`.
///
/// On x86_64: AVX2 when the CPU reports it and the input is at least 64 bytes, else
/// SSE2 for inputs of at least 32 bytes. Everything else goes through the portable
/// scalar `count_lw_c_chunk`. All variants return
/// `(lines, words, first_is_word, ends_in_word)`.
#[inline]
fn count_lw_c_chunk_fast(data: &[u8]) -> (u64, u64, bool, bool) {
    #[cfg(target_arch = "x86_64")]
    {
        let size = data.len();
        if size >= 64 && is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 support was verified at runtime on the line above.
            return unsafe { count_lw_c_chunk_avx2(data) };
        }
        if size >= 32 {
            // SAFETY: this block is only compiled for x86_64, where SSE2 is part of
            // the baseline instruction set.
            return unsafe { count_lw_c_chunk_sse2(data) };
        }
    }
    count_lw_c_chunk(data)
}
/// Portable scalar line + word counter using the byte-wise (C locale) rules.
///
/// Returns `(lines, words, first_is_word, ends_in_word)`, where `first_is_word` is
/// `first_is_word(data)` and `ends_in_word` is the in-word state after the last byte.
fn count_lw_c_chunk(data: &[u8]) -> (u64, u64, bool, bool) {
    let starts_with_word = first_is_word(data);
    let mut newline_total = 0u64;
    let mut word_total = 0u64;
    let mut inside_word = false;
    for &byte in data {
        let idx = usize::from(byte);
        if IS_SPACE[idx] {
            inside_word = false;
            if byte == b'\n' {
                newline_total += 1;
            }
        } else if IS_PRINT[idx] && !inside_word {
            inside_word = true;
            word_total += 1;
        }
    }
    (newline_total, word_total, starts_with_word, inside_word)
}
/// Word counter that decodes `data` as UTF-8.
///
/// * ASCII: bytes in `IS_SPACE` end a word, bytes `0x21..=0x7E` start/continue one, all
///   other ASCII bytes leave the state untouched.
/// * Multi-byte sequences (lead bytes `0xC2..=0xF4` followed by the right number of
///   `10xxxxxx` continuation bytes) are decoded; the code point ends a word when
///   `is_unicode_word_break` says so and starts/continues one when
///   `is_printable_unicode` says so.
/// * Any byte that does not begin such a sequence is skipped one byte at a time without
///   changing the state.
fn count_words_utf8(data: &[u8]) -> u64 {
    // Payload bits of the continuation byte at `idx`, or `None` when `idx` is out of
    // range or the byte is not of the form 10xxxxxx.
    let cont = |idx: usize| -> Option<u32> {
        match data.get(idx) {
            Some(&c) if c & 0xC0 == 0x80 => Some(u32::from(c & 0x3F)),
            _ => None,
        }
    };

    let mut total = 0u64;
    let mut inside_word = false;
    let mut pos = 0usize;
    while pos < data.len() {
        let lead = data[pos];

        if lead < 0x80 {
            if IS_SPACE[usize::from(lead)] {
                inside_word = false;
            } else if (0x21..=0x7E).contains(&lead) && !inside_word {
                inside_word = true;
                total += 1;
            }
            pos += 1;
            continue;
        }

        // Try to decode a 2-, 3- or 4-byte sequence starting at `pos`.
        let decoded: Option<(u32, usize)> = match lead {
            0xC2..=0xDF => cont(pos + 1).map(|c1| ((u32::from(lead & 0x1F) << 6) | c1, 2)),
            0xE0..=0xEF => match (cont(pos + 1), cont(pos + 2)) {
                (Some(c1), Some(c2)) => {
                    Some(((u32::from(lead & 0x0F) << 12) | (c1 << 6) | c2, 3))
                }
                _ => None,
            },
            0xF0..=0xF4 => match (cont(pos + 1), cont(pos + 2), cont(pos + 3)) {
                (Some(c1), Some(c2), Some(c3)) => Some((
                    (u32::from(lead & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3,
                    4,
                )),
                _ => None,
            },
            // 0x80..=0xC1 and 0xF5..=0xFF never start a sequence here.
            _ => None,
        };

        match decoded {
            Some((cp, consumed)) => {
                if is_unicode_word_break(cp) {
                    inside_word = false;
                } else if is_printable_unicode(cp) && !inside_word {
                    inside_word = true;
                    total += 1;
                }
                pos += consumed;
            }
            None => pos += 1,
        }
    }
    total
}
/// Counts lines and words in one call.
///
/// UTF-8 mode uses the fused decoder `count_lines_words_utf8_fused`; otherwise the
/// byte-wise fast path `count_lw_c_chunk_fast` is used and its two boundary flags are
/// dropped.
pub fn count_lines_words(data: &[u8], utf8: bool) -> (u64, u64) {
    if utf8 {
        return count_lines_words_utf8_fused(data);
    }
    let (lines, words, _starts_with_word, _ends_in_word) = count_lw_c_chunk_fast(data);
    (lines, words)
}
/// Single-pass line + word counter that decodes `data` as UTF-8.
///
/// Lines are the number of `\n` bytes. Words follow the same rules as
/// `count_words_utf8`: ASCII whitespace and Unicode word breaks end a word, bytes
/// `0x21..=0x7E` and printable Unicode code points start/continue one, and anything
/// else (other ASCII bytes, undecodable bytes) leaves the state untouched.
fn count_lines_words_utf8_fused(data: &[u8]) -> (u64, u64) {
    // Payload bits of the continuation byte at `idx`, or `None` when `idx` is out of
    // range or the byte is not of the form 10xxxxxx.
    let cont = |idx: usize| -> Option<u32> {
        match data.get(idx) {
            Some(&c) if c & 0xC0 == 0x80 => Some(u32::from(c & 0x3F)),
            _ => None,
        }
    };

    let mut newline_total = 0u64;
    let mut word_total = 0u64;
    let mut inside_word = false;
    let mut pos = 0usize;
    while pos < data.len() {
        let lead = data[pos];

        if lead < 0x80 {
            if lead == b'\n' {
                newline_total += 1;
            }
            // '\n' is part of IS_SPACE, so it also closes the current word here.
            if IS_SPACE[usize::from(lead)] {
                inside_word = false;
            } else if (0x21..=0x7E).contains(&lead) && !inside_word {
                inside_word = true;
                word_total += 1;
            }
            pos += 1;
            continue;
        }

        // Try to decode a 2-, 3- or 4-byte sequence starting at `pos`.
        let decoded: Option<(u32, usize)> = match lead {
            0xC2..=0xDF => cont(pos + 1).map(|c1| ((u32::from(lead & 0x1F) << 6) | c1, 2)),
            0xE0..=0xEF => match (cont(pos + 1), cont(pos + 2)) {
                (Some(c1), Some(c2)) => {
                    Some(((u32::from(lead & 0x0F) << 12) | (c1 << 6) | c2, 3))
                }
                _ => None,
            },
            0xF0..=0xF4 => match (cont(pos + 1), cont(pos + 2), cont(pos + 3)) {
                (Some(c1), Some(c2), Some(c3)) => Some((
                    (u32::from(lead & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3,
                    4,
                )),
                _ => None,
            },
            // 0x80..=0xC1 and 0xF5..=0xFF never start a sequence here.
            _ => None,
        };

        match decoded {
            Some((cp, consumed)) => {
                if is_unicode_word_break(cp) {
                    inside_word = false;
                } else if is_printable_unicode(cp) && !inside_word {
                    inside_word = true;
                    word_total += 1;
                }
                pos += consumed;
            }
            None => pos += 1,
        }
    }
    (newline_total, word_total)
}
/// Returns `(lines, words, chars)` for `data`.
///
/// In UTF-8 mode the character count comes from `count_chars_utf8`; otherwise every
/// byte is one character.
pub fn count_lines_words_chars(data: &[u8], utf8: bool) -> (u64, u64, u64) {
    if !utf8 {
        let (lines, words) = count_lines_words(data, false);
        return (lines, words, data.len() as u64);
    }
    let (lines, words) = count_lines_words_utf8_fused(data);
    (lines, words, count_chars_utf8(data))
}
/// Counts UTF-8 characters in `data`, i.e. bytes that are not `10xxxxxx` continuation
/// bytes.
///
/// Uses the AVX2 kernel on x86_64 CPUs that report AVX2 and the scalar implementation
/// everywhere else.
pub fn count_chars_utf8(data: &[u8]) -> u64 {
    #[cfg(target_arch = "x86_64")]
    {
        let has_avx2 = is_x86_feature_detected!("avx2");
        if has_avx2 {
            // SAFETY: AVX2 availability was just confirmed at runtime.
            return unsafe { count_chars_utf8_avx2(data) };
        }
    }
    count_chars_utf8_scalar(data)
}
/// AVX2 kernel for `count_chars_utf8`: counts the bytes of `data` that are not UTF-8
/// continuation bytes (`byte & 0xC0 != 0x80`).
///
/// # Safety
/// The function is compiled with `#[target_feature(enable = "avx2")]`; the caller must
/// ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_chars_utf8_avx2(data: &[u8]) -> u64 {
unsafe {
use std::arch::x86_64::*;
let mask_c0 = _mm256_set1_epi8(0xC0u8 as i8);
let val_80 = _mm256_set1_epi8(0x80u8 as i8);
let ones = _mm256_set1_epi8(1);
let zero = _mm256_setzero_si256();
let mut total = 0u64;
let len = data.len();
let ptr = data.as_ptr();
let mut i = 0;
// Per-lane u8 counters; `batch` counts iterations since the last flush.
let mut acc = _mm256_setzero_si256();
let mut batch = 0u32;
while i + 32 <= len {
// The loop condition guarantees the 32-byte unaligned load stays inside `data`.
let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
// A continuation byte has its top two bits equal to `10`.
let masked = _mm256_and_si256(v, mask_c0);
let is_cont = _mm256_cmpeq_epi8(masked, val_80);
// `andnot(is_cont, ones)` leaves 1 in every lane that is NOT a continuation byte.
let non_cont = _mm256_andnot_si256(is_cont, ones);
acc = _mm256_add_epi8(acc, non_cont);
batch += 1;
// Each lane grows by at most 1 per iteration, so flush after 255 iterations,
// before a u8 lane could overflow.
if batch >= 255 {
// Horizontal sum: SAD against zero yields four 64-bit partial sums, then fold them.
let sad = _mm256_sad_epu8(acc, zero);
let hi = _mm256_extracti128_si256(sad, 1);
let lo = _mm256_castsi256_si128(sad);
let sum = _mm_add_epi64(lo, hi);
let hi64 = _mm_unpackhi_epi64(sum, sum);
let t = _mm_add_epi64(sum, hi64);
total += _mm_cvtsi128_si64(t) as u64;
acc = _mm256_setzero_si256();
batch = 0;
}
i += 32;
}
// Flush the counters left over from the last partial batch.
if batch > 0 {
let sad = _mm256_sad_epu8(acc, zero);
let hi = _mm256_extracti128_si256(sad, 1);
let lo = _mm256_castsi256_si128(sad);
let sum = _mm_add_epi64(lo, hi);
let hi64 = _mm_unpackhi_epi64(sum, sum);
let t = _mm_add_epi64(sum, hi64);
total += _mm_cvtsi128_si64(t) as u64;
}
// Scalar loop for the final `< 32` bytes.
while i < len {
total += ((*ptr.add(i) & 0xC0) != 0x80) as u64;
i += 1;
}
total
}
}
/// Portable implementation of `count_chars_utf8`: counts the bytes of `data` that are
/// not UTF-8 continuation bytes (`byte & 0xC0 != 0x80`).
///
/// Works in 64-byte blocks. A block whose bytes OR together to a value below `0x80` is
/// pure ASCII and contributes 64 directly; other blocks build a 64-bit mask with one bit
/// per non-continuation byte and add its population count.
fn count_chars_utf8_scalar(data: &[u8]) -> u64 {
    let blocks = data.chunks_exact(64);
    let tail = blocks.remainder();

    let mut total = 0u64;
    for block in blocks {
        let folded = block.iter().fold(0u8, |acc, &b| acc | b);
        if folded < 0x80 {
            total += 64;
        } else {
            let mut lead_bits = 0u64;
            for (bit, &b) in block.iter().enumerate() {
                lead_bits |= u64::from((b & 0xC0) != 0x80) << bit;
            }
            total += u64::from(lead_bits.count_ones());
        }
    }

    let tail_chars = tail.iter().filter(|&&b| (b & 0xC0) != 0x80).count();
    total + tail_chars as u64
}
/// Character count under byte-wise (C locale) rules: one character per byte.
#[inline]
pub fn count_chars_c(data: &[u8]) -> u64 {
    let byte_len = data.len();
    byte_len as u64
}
/// Counts characters in `data`: UTF-8 characters when `utf8` is `true`, bytes otherwise.
#[inline]
pub fn count_chars(data: &[u8], utf8: bool) -> u64 {
    if !utf8 {
        return count_chars_c(data);
    }
    count_chars_utf8(data)
}
/// Detects a UTF-8 locale from the environment.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order; the first one that is
/// set to a non-empty value decides. The result is `true` when that value contains
/// `utf-8` or `utf8`, compared case-insensitively. If none of the variables is set to a
/// non-empty value the result is `false`.
pub fn is_utf8_locale() -> bool {
    const LOCALE_VARS: [&str; 3] = ["LC_ALL", "LC_CTYPE", "LANG"];
    LOCALE_VARS
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .map(|value| {
            let lowered = value.to_ascii_lowercase();
            lowered.contains("utf-8") || lowered.contains("utf8")
        })
        .unwrap_or(false)
}
/// Decodes the UTF-8 sequence at the start of `bytes`.
///
/// Returns `(code_point, bytes_consumed)`. When the first byte is ASCII, is not a lead
/// byte in `0xC2..=0xF4`, or is not followed by the required number of `10xxxxxx`
/// continuation bytes, the first byte itself is returned as the value with a length
/// of 1.
///
/// # Panics
/// Panics if `bytes` is empty.
#[inline]
fn decode_utf8(bytes: &[u8]) -> (u32, usize) {
    let lead = bytes[0];
    // Payload bits of the continuation byte at `idx`, or `None` when it is missing or
    // not of the form 10xxxxxx.
    let cont = |idx: usize| -> Option<u32> {
        match bytes.get(idx) {
            Some(&c) if c & 0xC0 == 0x80 => Some(u32::from(c & 0x3F)),
            _ => None,
        }
    };

    let decoded: Option<(u32, usize)> = match lead {
        0xC2..=0xDF => cont(1).map(|c1| ((u32::from(lead & 0x1F) << 6) | c1, 2)),
        0xE0..=0xEF => match (cont(1), cont(2)) {
            (Some(c1), Some(c2)) => Some(((u32::from(lead & 0x0F) << 12) | (c1 << 6) | c2, 3)),
            _ => None,
        },
        0xF0..=0xF4 => match (cont(1), cont(2), cont(3)) {
            (Some(c1), Some(c2), Some(c3)) => Some((
                (u32::from(lead & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3,
                4,
            )),
            _ => None,
        },
        _ => None,
    };

    decoded.unwrap_or((u32::from(lead), 1))
}
/// Returns `true` when `cp` is one of the code points this module treats as occupying
/// no columns in `max_line_length_utf8`.
///
/// The table is a fixed list of code points and ranges in ascending order.
#[inline]
fn is_zero_width(cp: u32) -> bool {
    match cp {
        0x0300..=0x036F | 0x0483..=0x0489 | 0x0591..=0x05BD | 0x05BF
        | 0x05C1..=0x05C2 | 0x05C4..=0x05C5 | 0x05C7
        | 0x0600..=0x0605 | 0x0610..=0x061A | 0x064B..=0x065F | 0x0670
        | 0x06D6..=0x06DD | 0x06DF..=0x06E4 | 0x06E7..=0x06E8 | 0x06EA..=0x06ED
        | 0x070F | 0x0711 | 0x0730..=0x074A | 0x07A6..=0x07B0 | 0x07EB..=0x07F3 | 0x07FD
        | 0x0816..=0x0819 | 0x081B..=0x0823 | 0x0825..=0x0827 | 0x0829..=0x082D
        | 0x0859..=0x085B | 0x08D3..=0x08E1 | 0x08E3..=0x0902
        | 0x093A | 0x093C | 0x0941..=0x0948 | 0x094D | 0x0951..=0x0957 | 0x0962..=0x0963
        | 0x0981 | 0x09BC | 0x09C1..=0x09C4 | 0x09CD | 0x09E2..=0x09E3 | 0x09FE
        | 0x0A01..=0x0A02 | 0x0A3C | 0x0A41..=0x0A42 | 0x0A47..=0x0A48 | 0x0A4B..=0x0A4D
        | 0x0A51 | 0x0A70..=0x0A71 | 0x0A75
        | 0x0A81..=0x0A82 | 0x0ABC | 0x0AC1..=0x0AC5 | 0x0AC7..=0x0AC8 | 0x0ACD
        | 0x0AE2..=0x0AE3 | 0x0AFA..=0x0AFF
        | 0x0B01 | 0x0B3C | 0x0B3F | 0x0B41..=0x0B44 | 0x0B4D | 0x0B56 | 0x0B62..=0x0B63
        | 0x0B82 | 0x0BC0 | 0x0BCD
        | 0x0C00 | 0x0C04 | 0x0C3E..=0x0C40 | 0x0C46..=0x0C48 | 0x0C4A..=0x0C4D
        | 0x0C55..=0x0C56 | 0x0C62..=0x0C63
        | 0x0C81 | 0x0CBC | 0x0CBF | 0x0CC6 | 0x0CCC..=0x0CCD | 0x0CE2..=0x0CE3
        | 0x0D00..=0x0D01 | 0x0D3B..=0x0D3C | 0x0D41..=0x0D44 | 0x0D4D | 0x0D62..=0x0D63
        | 0x0DCA | 0x0DD2..=0x0DD4 | 0x0DD6
        | 0x0E31 | 0x0E34..=0x0E3A | 0x0E47..=0x0E4E
        | 0x0EB1 | 0x0EB4..=0x0EBC | 0x0EC8..=0x0ECD
        | 0x0F18..=0x0F19 | 0x0F35 | 0x0F37 | 0x0F39 | 0x0F71..=0x0F7E | 0x0F80..=0x0F84
        | 0x0F86..=0x0F87 | 0x0F8D..=0x0F97 | 0x0F99..=0x0FBC | 0x0FC6
        | 0x102D..=0x1030 | 0x1032..=0x1037 | 0x1039..=0x103A | 0x103D..=0x103E
        | 0x1058..=0x1059 | 0x105E..=0x1060 | 0x1071..=0x1074 | 0x1082 | 0x1085..=0x1086
        | 0x108D | 0x109D
        | 0x1160..=0x11FF | 0x135D..=0x135F
        | 0x1712..=0x1714 | 0x1732..=0x1734 | 0x1752..=0x1753 | 0x1772..=0x1773
        | 0x17B4..=0x17B5 | 0x17B7..=0x17BD | 0x17C6 | 0x17C9..=0x17D3 | 0x17DD
        | 0x180B..=0x180D | 0x1885..=0x1886 | 0x18A9
        | 0x1920..=0x1922 | 0x1927..=0x1928 | 0x1932 | 0x1939..=0x193B
        | 0x1A17..=0x1A18 | 0x1A1B | 0x1A56 | 0x1A58..=0x1A5E | 0x1A60 | 0x1A62
        | 0x1A65..=0x1A6C | 0x1A73..=0x1A7C | 0x1A7F
        | 0x1AB0..=0x1ABE
        | 0x1B00..=0x1B03 | 0x1B34 | 0x1B36..=0x1B3A | 0x1B3C | 0x1B42 | 0x1B6B..=0x1B73
        | 0x1B80..=0x1B81 | 0x1BA2..=0x1BA5 | 0x1BA8..=0x1BA9 | 0x1BAB..=0x1BAD
        | 0x1BE6 | 0x1BE8..=0x1BE9 | 0x1BED | 0x1BEF..=0x1BF1
        | 0x1C2C..=0x1C33 | 0x1C36..=0x1C37
        | 0x1CD0..=0x1CD2 | 0x1CD4..=0x1CE0 | 0x1CE2..=0x1CE8 | 0x1CED | 0x1CF4
        | 0x1CF8..=0x1CF9
        | 0x1DC0..=0x1DF9 | 0x1DFB..=0x1DFF
        | 0x200B..=0x200F | 0x202A..=0x202E | 0x2060..=0x2064 | 0x2066..=0x206F
        | 0x20D0..=0x20F0
        | 0xFE00..=0xFE0F | 0xFE20..=0xFE2F | 0xFEFF | 0xFFF9..=0xFFFB
        | 0x1D167..=0x1D169 | 0x1D173..=0x1D182 | 0x1D185..=0x1D18B | 0x1D1AA..=0x1D1AD
        | 0x1D242..=0x1D244
        | 0xE0001 | 0xE0020..=0xE007F | 0xE0100..=0xE01EF => true,
        _ => false,
    }
}
/// Returns `true` when `cp` is one of the code points this module treats as occupying
/// two columns in `max_line_length_utf8`.
///
/// The table is a fixed list of code points and ranges in ascending order.
#[inline]
fn is_wide_char(cp: u32) -> bool {
    match cp {
        0x1100..=0x115F | 0x231A..=0x231B | 0x2329..=0x232A | 0x23E9..=0x23F3
        | 0x23F8..=0x23FA | 0x25FD..=0x25FE
        | 0x2614..=0x2615 | 0x2648..=0x2653 | 0x267F | 0x2693 | 0x26A1 | 0x26AA..=0x26AB
        | 0x26BD..=0x26BE | 0x26C4..=0x26C5 | 0x26CE | 0x26D4 | 0x26EA | 0x26F2..=0x26F3
        | 0x26F5 | 0x26FA | 0x26FD
        | 0x2702 | 0x2705 | 0x2708..=0x270D | 0x270F | 0x2712 | 0x2714 | 0x2716 | 0x271D
        | 0x2721 | 0x2728 | 0x2733..=0x2734 | 0x2744 | 0x2747 | 0x274C | 0x274E
        | 0x2753..=0x2755 | 0x2757 | 0x2763..=0x2764 | 0x2795..=0x2797 | 0x27A1 | 0x27B0
        | 0x27BF
        | 0x2934..=0x2935 | 0x2B05..=0x2B07 | 0x2B1B..=0x2B1C | 0x2B50 | 0x2B55
        | 0x2E80..=0x303E | 0x3040..=0x33BF | 0x3400..=0x4DBF | 0x4E00..=0xA4CF
        | 0xA960..=0xA97C | 0xAC00..=0xD7A3 | 0xF900..=0xFAFF | 0xFE10..=0xFE19
        | 0xFE30..=0xFE6F | 0xFF01..=0xFF60 | 0xFFE0..=0xFFE6
        | 0x1F004 | 0x1F0CF | 0x1F170..=0x1F171 | 0x1F17E..=0x1F17F | 0x1F18E
        | 0x1F191..=0x1F19A | 0x1F1E0..=0x1F1FF | 0x1F200..=0x1F202 | 0x1F210..=0x1F23B
        | 0x1F240..=0x1F248 | 0x1F250..=0x1F251 | 0x1F260..=0x1F265
        | 0x1F300..=0x1F64F | 0x1F680..=0x1F6FF | 0x1F900..=0x1F9FF | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x20000..=0x2FFFD | 0x30000..=0x3FFFD => true,
        _ => false,
    }
}
/// Computes the widest line of `data` under byte-wise (C locale) rules.
///
/// Column accounting per byte:
/// * `0x20..=0x7E` advance the column by one;
/// * TAB advances to the next multiple of 8;
/// * CR resets the column to 0 but keeps the width already reached on the line;
/// * LF and FF finish the line (its width competes for the maximum) and reset both;
/// * every other byte is ignored.
///
/// The width of a final line without a terminator is taken into account as well.
pub fn max_line_length_c(data: &[u8]) -> u64 {
    let mut longest = 0u64; // widest finished line so far
    let mut widest = 0u64; // furthest column reached on the current line
    let mut column = 0u64; // current column on the current line

    for &byte in data {
        match byte {
            b'\n' | 0x0C => {
                longest = longest.max(widest);
                column = 0;
                widest = 0;
            }
            b'\r' => column = 0,
            b'\t' => {
                column = (column + 8) & !7;
                widest = widest.max(column);
            }
            0x20..=0x7E => {
                column += 1;
                widest = widest.max(column);
            }
            _ => {}
        }
    }

    longest.max(widest)
}
/// Computes the widest line of `data`, decoding non-ASCII bytes with `decode_utf8`.
///
/// ASCII bytes follow the same rules as `max_line_length_c`: `0x20..=0x7E` advance by
/// one, TAB advances to the next multiple of 8, CR resets the column, LF and FF finish
/// the line, other ASCII bytes are ignored. For a byte `>= 0x80` the value returned by
/// `decode_utf8` adds:
/// * 0 columns when it is `<= 0x9F` or `is_zero_width`,
/// * 2 columns when `is_wide_char`,
/// * 1 column otherwise.
///
/// The width of a final line without a terminator is taken into account as well.
pub fn max_line_length_utf8(data: &[u8]) -> u64 {
    let mut longest = 0u64; // widest finished line so far
    let mut widest = 0u64; // furthest column reached on the current line
    let mut column = 0u64; // current column on the current line

    let mut at = 0usize;
    while at < data.len() {
        let byte = data[at];
        let consumed = match byte {
            b'\n' | 0x0C => {
                longest = longest.max(widest);
                column = 0;
                widest = 0;
                1
            }
            b'\r' => {
                column = 0;
                1
            }
            b'\t' => {
                column = (column + 8) & !7;
                widest = widest.max(column);
                1
            }
            0x20..=0x7E => {
                column += 1;
                widest = widest.max(column);
                1
            }
            // Remaining ASCII bytes take no columns.
            0x00..=0x7F => 1,
            _ => {
                let (cp, seq_len) = decode_utf8(&data[at..]);
                let columns = if cp <= 0x9F || is_zero_width(cp) {
                    0
                } else if is_wide_char(cp) {
                    2
                } else {
                    1
                };
                if columns > 0 {
                    column += columns;
                    widest = widest.max(column);
                }
                seq_len
            }
        };
        at += consumed;
    }

    longest.max(widest)
}
/// Computes the widest line of `data`, using the UTF-8 rules when `utf8` is `true` and
/// the byte-wise rules otherwise.
#[inline]
pub fn max_line_length(data: &[u8], utf8: bool) -> u64 {
    if !utf8 {
        return max_line_length_c(data);
    }
    max_line_length_utf8(data)
}
/// Computes every counter in [`WcCounts`] for `data` sequentially.
///
/// In UTF-8 mode lines and words come from the fused decoder and characters from
/// `count_chars_utf8`; in byte-wise mode each byte is one character.
pub fn count_all(data: &[u8], utf8: bool) -> WcCounts {
    let bytes = data.len() as u64;

    if !utf8 {
        return WcCounts {
            lines: count_lines(data),
            words: count_words_locale(data, false),
            bytes,
            chars: bytes,
            max_line_length: max_line_length_c(data),
        };
    }

    let (lines, words) = count_lines_words_utf8_fused(data);
    let chars = count_chars_utf8(data);
    let max_line_length = max_line_length_utf8(data);
    WcCounts {
        lines,
        words,
        bytes,
        chars,
        max_line_length,
    }
}
/// Cheap heuristic: checks whether up to three windows of `data` are pure ASCII.
///
/// The windows are at most 256 bytes each: the start of the buffer, a window around
/// the midpoint (only when the buffer is longer than two windows) and the end of the
/// buffer (only when it is longer than one window). Bytes outside these windows are
/// NOT inspected, so `true` does not prove that the whole buffer is ASCII. An empty
/// buffer yields `true`.
#[inline]
fn check_ascii_sample(data: &[u8]) -> bool {
    let total = data.len();
    if total == 0 {
        return true;
    }
    let window = total.min(256);

    let head_ok = data[..window].is_ascii();
    if !head_ok {
        return false;
    }

    if total > window * 2 {
        let from = (total / 2).saturating_sub(window / 2);
        let to = (from + window).min(total);
        if !data[from..to].is_ascii() {
            return false;
        }
    }

    if total > window && !data[total - window..].is_ascii() {
        return false;
    }

    true
}
/// Splits `data` into at most `num_chunks` consecutive slices whose boundaries fall
/// directly after a `\n` byte.
///
/// Each cut is placed after the first newline found at or beyond
/// `previous_cut + data.len() / num_chunks`; when no newline follows, the rest of the
/// buffer becomes the final slice. The slices cover `data` completely and in order. An
/// empty buffer or `num_chunks <= 1` yields a single slice containing all of `data`.
fn split_at_newlines(data: &[u8], num_chunks: usize) -> Vec<&[u8]> {
    if num_chunks <= 1 || data.is_empty() {
        return vec![data];
    }

    let total = data.len();
    let stride = total / num_chunks;
    let mut pieces = Vec::with_capacity(num_chunks);
    let mut start = 0usize;

    // The last piece is whatever remains, so at most `num_chunks - 1` cuts are made.
    for _ in 1..num_chunks {
        let probe = start + stride;
        if probe >= total {
            break;
        }
        let cut = match memchr::memchr(b'\n', &data[probe..]) {
            Some(offset) => probe + offset + 1,
            None => total,
        };
        if cut > start {
            pieces.push(&data[start..cut]);
        }
        start = cut;
    }

    if start < total {
        pieces.push(&data[start..]);
    }
    pieces
}
pub fn count_lines_parallel(data: &[u8]) -> u64 {
if data.len() < PARALLEL_THRESHOLD {
return count_lines(data);
}
let num_threads = rayon::current_num_threads().max(1);
let chunk_size = (data.len() / num_threads).max(2 * 1024 * 1024);
data.par_chunks(chunk_size)
.map(|chunk| memchr_iter(b'\n', chunk).count() as u64)
.sum()
}
pub fn count_words_parallel(data: &[u8], utf8: bool) -> u64 {
if data.len() < PARALLEL_THRESHOLD {
return count_words_locale(data, utf8);
}
let num_threads = rayon::current_num_threads().max(1);
if utf8 {
let chunks = split_at_newlines(data, num_threads);
chunks.par_iter().map(|chunk| count_words_utf8(chunk)).sum()
} else {
let chunk_size = (data.len() / num_threads).max(1024 * 1024);
let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
let results: Vec<(u64, u64, bool, bool)> = chunks
.par_iter()
.map(|chunk| count_lw_c_chunk(chunk))
.collect();
let mut total = 0u64;
for i in 0..results.len() {
total += results[i].1;
if i > 0 && results[i - 1].3 && results[i].2 {
total -= 1;
}
}
total
}
}
/// Counts characters in `data`, using the rayon thread pool for large UTF-8 inputs.
///
/// Byte-wise mode returns the byte length directly. UTF-8 inputs shorter than
/// `PARALLEL_THRESHOLD` are counted sequentially; larger ones are cut into fixed-size
/// chunks (at least 1 MiB each), counted in parallel with `count_chars_utf8` and summed.
/// Chunks may split a multi-byte sequence; this is harmless because only
/// non-continuation bytes are counted.
pub fn count_chars_parallel(data: &[u8], utf8: bool) -> u64 {
    let byte_len = data.len();
    if !utf8 {
        return byte_len as u64;
    }
    if byte_len < PARALLEL_THRESHOLD {
        return count_chars_utf8(data);
    }
    let workers = rayon::current_num_threads().max(1);
    let stride = (byte_len / workers).max(1024 * 1024);
    data.par_chunks(stride)
        .map(|piece| count_chars_utf8(piece))
        .sum()
}
/// Returns `(lines, words, bytes)` for `data`, computed sequentially.
pub fn count_lwb(data: &[u8], utf8: bool) -> (u64, u64, u64) {
    let byte_total = data.len() as u64;
    let (line_total, word_total) = count_lines_words(data, utf8);
    (line_total, word_total, byte_total)
}
pub fn count_lwb_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
if data.len() < PARALLEL_THRESHOLD {
return count_lwb(data, utf8);
}
let num_threads = rayon::current_num_threads().max(1);
let (lines, words) = if !utf8 {
let chunk_size = (data.len() / num_threads).max(1024 * 1024);
let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
let results: Vec<(u64, u64, bool, bool)> = chunks
.par_iter()
.map(|chunk| count_lw_c_chunk_fast(chunk))
.collect();
let mut line_total = 0u64;
let mut word_total = 0u64;
for i in 0..results.len() {
line_total += results[i].0;
word_total += results[i].1;
if i > 0 && results[i - 1].3 && results[i].2 {
word_total -= 1;
}
}
(line_total, word_total)
} else {
let is_ascii = check_ascii_sample(data);
if is_ascii {
let chunk_size = (data.len() / num_threads).max(1024 * 1024);
let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
let results: Vec<(u64, u64, bool, bool)> = chunks
.par_iter()
.map(|chunk| count_lw_c_chunk_fast(chunk))
.collect();
let mut line_total = 0u64;
let mut word_total = 0u64;
for i in 0..results.len() {
line_total += results[i].0;
word_total += results[i].1;
if i > 0 && results[i - 1].3 && results[i].2 {
word_total -= 1;
}
}
(line_total, word_total)
} else {
let chunks = split_at_newlines(data, num_threads);
let results: Vec<(u64, u64)> = chunks
.par_iter()
.map(|chunk| count_lines_words_utf8_fused(chunk))
.collect();
let mut line_total = 0u64;
let mut word_total = 0u64;
for (l, w) in results {
line_total += l;
word_total += w;
}
(line_total, word_total)
}
};
(lines, words, data.len() as u64)
}
pub fn count_lwc_parallel(data: &[u8], utf8: bool) -> (u64, u64, u64) {
if data.len() < PARALLEL_THRESHOLD {
let lines = count_lines(data);
let words = count_words_locale(data, utf8);
let chars = count_chars(data, utf8);
return (lines, words, chars);
}
let num_threads = rayon::current_num_threads().max(1);
if utf8 {
let chunks = split_at_newlines(data, num_threads);
let results: Vec<(u64, u64, u64)> = chunks
.par_iter()
.map(|chunk| {
let (lines, words) = count_lines_words_utf8_fused(chunk);
let chars = count_chars_utf8(chunk);
(lines, words, chars)
})
.collect();
let mut lines = 0u64;
let mut words = 0u64;
let mut chars = 0u64;
for (l, w, c) in results {
lines += l;
words += w;
chars += c;
}
(lines, words, chars)
} else {
let chunk_size = (data.len() / num_threads).max(1024 * 1024);
let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
let results: Vec<(u64, u64, bool, bool)> = chunks
.par_iter()
.map(|chunk| count_lw_c_chunk_fast(chunk))
.collect();
let mut lines = 0u64;
let mut words = 0u64;
for i in 0..results.len() {
lines += results[i].0;
words += results[i].1;
if i > 0 && results[i - 1].3 && results[i].2 {
words -= 1;
}
}
(lines, words, data.len() as u64)
}
}
pub fn max_line_length_parallel(data: &[u8], utf8: bool) -> u64 {
if data.len() < PARALLEL_THRESHOLD {
return max_line_length(data, utf8);
}
let num_threads = rayon::current_num_threads().max(1);
let chunks = split_at_newlines(data, num_threads);
chunks
.par_iter()
.map(|chunk| {
if utf8 {
max_line_length_utf8(chunk)
} else {
max_line_length_c(chunk)
}
})
.max()
.unwrap_or(0)
}
/// Computes every counter in [`WcCounts`] for `data`, using the rayon thread pool for
/// large inputs.
///
/// Inputs shorter than `PARALLEL_THRESHOLD` are handled by `count_all`. Larger inputs
/// are split after newlines (`split_at_newlines`); each chunk is counted independently
/// and the results are combined: lines, words and (in UTF-8 mode) characters are
/// summed, the line width is the maximum over all chunks. `bytes` is the total length,
/// and in byte-wise mode `chars` equals `bytes`.
pub fn count_all_parallel(data: &[u8], utf8: bool) -> WcCounts {
    if data.len() < PARALLEL_THRESHOLD {
        return count_all(data, utf8);
    }
    let workers = rayon::current_num_threads().max(1);
    let pieces = split_at_newlines(data, workers);
    let byte_total = data.len() as u64;

    if !utf8 {
        let per_piece: Vec<(u64, u64, u64)> = pieces
            .par_iter()
            .map(|piece| {
                let (lines, words) = count_lines_words(piece, false);
                (lines, words, max_line_length_c(piece))
            })
            .collect();
        let start = WcCounts {
            bytes: byte_total,
            chars: byte_total,
            ..Default::default()
        };
        return per_piece
            .into_iter()
            .fold(start, |mut acc, (lines, words, width)| {
                acc.lines += lines;
                acc.words += words;
                acc.max_line_length = acc.max_line_length.max(width);
                acc
            });
    }

    let per_piece: Vec<(u64, u64, u64, u64)> = pieces
        .par_iter()
        .map(|piece| {
            let (lines, words) = count_lines_words_utf8_fused(piece);
            (
                lines,
                words,
                count_chars_utf8(piece),
                max_line_length_utf8(piece),
            )
        })
        .collect();
    let start = WcCounts {
        bytes: byte_total,
        ..Default::default()
    };
    per_piece
        .into_iter()
        .fold(start, |mut acc, (lines, words, chars, width)| {
            acc.lines += lines;
            acc.words += words;
            acc.chars += chars;
            acc.max_line_length = acc.max_line_length.max(width);
            acc
        })
}