use memchr::memmem::Finder;
/// Size in bytes (1 MiB) of each slice of the input that the parallel code
/// paths in this file process as one unit of work.
const CHUNK_SIZE: usize = 1024 * 1024;
/// Inputs shorter than this many bytes (512 KiB) are processed sequentially
/// by the counting functions instead of being split into chunks.
const PARALLEL_THRESHOLD: usize = 512 * 1024;
/// Counts the newline (`\n`) bytes in `data`.
///
/// Only terminators are counted, so a final line without a trailing newline
/// does not contribute. Inputs of at least `PARALLEL_THRESHOLD` bytes are
/// split into `CHUNK_SIZE` pieces whose counts are summed in parallel.
pub fn count_lines(data: &[u8]) -> usize {
    use rayon::prelude::*;

    /// Number of `\n` bytes in one contiguous slice.
    fn newlines_in(bytes: &[u8]) -> usize {
        memchr::memchr_iter(b'\n', bytes).count()
    }

    if data.len() >= PARALLEL_THRESHOLD {
        // A newline is a single byte, so chunk boundaries cannot split one.
        data.par_chunks(CHUNK_SIZE).map(newlines_in).sum::<usize>()
    } else {
        newlines_in(data)
    }
}
pub fn count_all_words(data: &[u8]) -> usize {
if data.is_empty() {
return 0;
}
if data.len() < PARALLEL_THRESHOLD {
return count_words_sequential(data);
}
use rayon::prelude::*;
let num_chunks = data.len().div_ceil(CHUNK_SIZE);
let count: usize = (0..num_chunks)
.into_par_iter()
.map(|i| {
let start = i * CHUNK_SIZE;
let end = ((i + 1) * CHUNK_SIZE).min(data.len());
let chunk = &data[start..end];
count_words_in_chunk(chunk)
})
.sum();
let mut overcounted = 0;
for i in 1..num_chunks {
let prev_end_idx = (i * CHUNK_SIZE) - 1;
let curr_start_idx = i * CHUNK_SIZE;
if !data[prev_end_idx].is_ascii_whitespace() && !data[curr_start_idx].is_ascii_whitespace()
{
overcounted += 1;
}
}
count - overcounted
}
/// Counts words (maximal runs of non-ASCII-whitespace bytes) in a single
/// left-to-right pass over `data`.
#[inline]
fn count_words_sequential(data: &[u8]) -> usize {
    // A word starts at every non-whitespace byte that follows whitespace or
    // sits at the very beginning of the input, so count those transitions.
    let (words, _) = data
        .iter()
        .fold((0usize, true), |(words, after_gap), byte| {
            let is_gap = byte.is_ascii_whitespace();
            (words + usize::from(after_gap && !is_gap), is_gap)
        });
    words
}
/// Counts the words found inside one chunk, treating the chunk in isolation:
/// a run of non-whitespace bytes touching either edge of `chunk` counts as
/// a word of its own.
#[inline]
fn count_words_in_chunk(chunk: &[u8]) -> usize {
    // Splitting on ASCII whitespace leaves empty pieces between adjacent
    // separators; every non-empty piece is exactly one word.
    chunk
        .split(|byte| byte.is_ascii_whitespace())
        .filter(|piece| !piece.is_empty())
        .count()
}
/// Counts the non-overlapping occurrences of `pattern` in `data`, scanning
/// left to right (the semantics of `Finder::find_iter`).
///
/// Returns 0 when either `data` or `pattern` is empty.
///
/// Large inputs are searched in parallel, but only when that provably gives
/// the same answer as the sequential scan:
///
/// * If the pattern has a border (a proper prefix that is also a suffix, as
///   in `aa` or `abab`), two occurrences can overlap. Which of them counts
///   then depends on every earlier match, so the scan stays sequential.
/// * Otherwise occurrences can never overlap. Each chunk counts exactly the
///   occurrences that *start* inside it by searching `pattern.len() - 1`
///   bytes past its end, so every match is counted once regardless of how
///   many chunk boundaries it crosses.
pub fn count_pattern(data: &[u8], pattern: &[u8]) -> usize {
    if data.is_empty() || pattern.is_empty() {
        return 0;
    }
    let finder = Finder::new(pattern);
    if data.len() < PARALLEL_THRESHOLD || pattern_can_self_overlap(pattern) {
        return finder.find_iter(data).count();
    }
    use rayon::prelude::*;
    let overlap = pattern.len() - 1;
    let num_chunks = data.len().div_ceil(CHUNK_SIZE);
    (0..num_chunks)
        .into_par_iter()
        .map(|i| {
            let start = i * CHUNK_SIZE;
            let end = (start + CHUNK_SIZE).min(data.len());
            // A match fully inside `start..search_end` must start before
            // `end`, i.e. inside this chunk.
            let search_end = end.saturating_add(overlap).min(data.len());
            finder.find_iter(&data[start..search_end]).count()
        })
        .sum()
}

/// Returns `true` when some proper, non-empty prefix of `pattern` is also a
/// suffix of it, which is exactly when two occurrences of `pattern` can
/// overlap. Computed in linear time with the KMP prefix function.
fn pattern_can_self_overlap(pattern: &[u8]) -> bool {
    let mut border = vec![0usize; pattern.len()];
    let mut matched = 0;
    for i in 1..pattern.len() {
        while matched > 0 && pattern[i] != pattern[matched] {
            matched = border[matched - 1];
        }
        if pattern[i] == pattern[matched] {
            matched += 1;
        }
        border[i] = matched;
    }
    // `matched` now holds the longest border of the whole pattern.
    matched > 0
}
/// Counts the Unicode scalar values (`char`s) in `data`.
///
/// If `data` is not valid UTF-8 the byte length is returned instead. This
/// holds for both the sequential and the parallel path: the parallel path
/// falls back to `data.len()` as soon as any piece fails validation.
///
/// For inputs of at least `PARALLEL_THRESHOLD` bytes the data is cut into
/// pieces of roughly `CHUNK_SIZE` bytes. Each cut is moved forward past up
/// to three UTF-8 continuation bytes so that it lands on a character
/// boundary; cutting at the raw chunk offset could split a multi-byte
/// character and make valid text look invalid.
pub fn count_chars(data: &[u8]) -> usize {
    if data.is_empty() {
        return 0;
    }
    if data.len() < PARALLEL_THRESHOLD {
        return std::str::from_utf8(data)
            .map(|s| s.chars().count())
            .unwrap_or(data.len());
    }
    use rayon::prelude::*;
    let num_chunks = data.len().div_ceil(CHUNK_SIZE);
    // Start offset of piece `i`; `num_chunks` maps to the end of the data.
    // The pieces always tile `data` exactly, so the input is valid UTF-8 if
    // and only if every piece is.
    let piece_start = |i: usize| -> usize {
        if i == 0 {
            return 0;
        }
        if i >= num_chunks {
            return data.len();
        }
        let nominal = i * CHUNK_SIZE;
        // A character has at most three continuation bytes (0b10xx_xxxx).
        // A longer run means the input is invalid, and the piece starting
        // on a continuation byte will fail validation below.
        let continuation = data[nominal..]
            .iter()
            .take(3)
            .take_while(|&&byte| (byte & 0xC0) == 0x80)
            .count();
        nominal + continuation
    };
    (0..num_chunks)
        .into_par_iter()
        .map(|i| {
            std::str::from_utf8(&data[piece_start(i)..piece_start(i + 1)])
                .ok()
                .map(|s| s.chars().count())
        })
        // `None` from any piece short-circuits the reduction.
        .try_reduce(|| 0, |a, b| Some(a + b))
        .unwrap_or(data.len())
}
pub fn max_line_length(data: &[u8]) -> usize {
if data.is_empty() {
return 0;
}
if data.len() < PARALLEL_THRESHOLD {
return max_line_length_sequential(data);
}
use rayon::prelude::*;
let num_chunks = data.len().div_ceil(CHUNK_SIZE);
(0..num_chunks)
.into_par_iter()
.map(|i| {
let start = i * CHUNK_SIZE;
let end = ((i + 1) * CHUNK_SIZE).min(data.len());
let chunk = &data[start..end];
max_line_length_sequential(chunk)
})
.max()
.unwrap_or(0)
}
/// Returns the byte length of the longest `\n`-separated segment of `data`,
/// terminators excluded. The segment after the last newline (or the whole
/// input when there is no newline) takes part in the comparison.
#[inline]
fn max_line_length_sequential(data: &[u8]) -> usize {
    // `split` always yields at least one (possibly empty) segment, so the
    // fallback only exists to unwrap the `Option`.
    data.split(|&byte| byte == b'\n')
        .map(|segment| segment.len())
        .max()
        .unwrap_or(0)
}
/// Heuristically decides whether `data` is binary.
///
/// Looks at no more than the first 8192 bytes and returns `true` as soon as
/// one of them is a control byte (value below 32, NUL included) other than
/// tab, line feed or carriage return. Empty input is not binary.
pub fn is_binary(data: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;
    data.iter()
        .take(SAMPLE_LIMIT)
        .any(|&byte| byte < b' ' && !matches!(byte, b'\t' | b'\n' | b'\r'))
}
use std::collections::{HashMap, HashSet};
/// Counts the distinct words in `data`, where a word is a maximal run of
/// bytes that are not ASCII whitespace and two words are equal when their
/// bytes are equal.
///
/// The input is handled as raw bytes, so text that is not valid UTF-8 is
/// counted too instead of yielding 0. For valid UTF-8 the result is the
/// same as splitting the decoded string on ASCII whitespace, because an
/// ASCII byte never occurs inside a multi-byte sequence.
pub fn count_unique_words(data: &[u8]) -> usize {
    data.split(|byte| byte.is_ascii_whitespace())
        .filter(|word| !word.is_empty())
        .collect::<HashSet<&[u8]>>()
        .len()
}
/// Summary statistics over the byte lengths of the lines of a buffer.
///
/// Line lengths are measured in bytes and exclude the `\n` terminator.
#[derive(Debug, Clone, PartialEq)]
pub struct Statistics {
    /// Arithmetic mean of the line lengths.
    pub mean_line_length: f64,
    /// Median line length; for an even number of lines, the mean of the two
    /// middle lengths rounded down.
    pub median_line_length: usize,
    /// Population standard deviation of the line lengths.
    pub std_dev: f64,
    /// Length of the shortest line.
    pub min_line_length: usize,
    /// Length of the longest line.
    pub max_line_length: usize,
    /// Number of lines of length zero.
    pub empty_lines: usize,
}

/// Computes line-length statistics for `data`.
///
/// Lines are separated by `\n`. A trailing newline terminates the last line
/// and does not start another one, so `b"a\n"` is one line of length 1. A
/// final line without a newline is still counted. Empty input yields a
/// `Statistics` with every field zero.
pub fn calculate_statistics(data: &[u8]) -> Statistics {
    let mut line_lengths: Vec<usize> = data
        .split(|&byte| byte == b'\n')
        .map(|line| line.len())
        .collect();
    // `split` reports an empty final piece when the data is empty or ends
    // with a newline; that piece is not a line.
    if data.is_empty() || data.ends_with(b"\n") {
        line_lengths.pop();
    }
    if line_lengths.is_empty() {
        return Statistics {
            mean_line_length: 0.0,
            median_line_length: 0,
            std_dev: 0.0,
            min_line_length: 0,
            max_line_length: 0,
            empty_lines: 0,
        };
    }

    let empty_lines = line_lengths.iter().filter(|&&len| len == 0).count();
    let line_count = line_lengths.len() as f64;
    let total: usize = line_lengths.iter().sum();
    let mean = total as f64 / line_count;
    let variance = line_lengths
        .iter()
        .map(|&len| {
            let diff = len as f64 - mean;
            diff * diff
        })
        .sum::<f64>()
        / line_count;

    // Sorting gives the median directly and puts min/max at the two ends.
    line_lengths.sort_unstable();
    let mid = line_lengths.len() / 2;
    let median = if line_lengths.len() % 2 == 0 {
        let (lo, hi) = (line_lengths[mid - 1], line_lengths[mid]);
        // Same value as (lo + hi) / 2 without the intermediate sum.
        lo + (hi - lo) / 2
    } else {
        line_lengths[mid]
    };

    Statistics {
        mean_line_length: mean,
        median_line_length: median,
        std_dev: variance.sqrt(),
        min_line_length: line_lengths[0],
        max_line_length: line_lengths[line_lengths.len() - 1],
        empty_lines,
    }
}
/// Builds a histogram of line lengths in `data`, grouped into buckets ten
/// bytes wide.
///
/// Each key is the lower bound of a bucket (0, 10, 20, ...) and each value
/// is the number of lines whose byte length, newline excluded, falls into
/// it. Every `\n`-terminated line is counted, empty ones included; the text
/// after the last newline is counted only when it is non-empty.
pub fn generate_histogram(data: &[u8]) -> HashMap<usize, usize> {
    let mut histogram: HashMap<usize, usize> = HashMap::new();
    let mut tally = |len: usize| {
        // Round the length down to the nearest multiple of ten.
        *histogram.entry(len - len % 10).or_default() += 1;
    };

    let mut pieces = data.split(|&byte| byte == b'\n');
    // The final piece is the only one not followed by a newline.
    let unterminated = pieces.next_back();
    for line in pieces {
        tally(line.len());
    }
    if let Some(rest) = unterminated {
        if !rest.is_empty() {
            tally(rest.len());
        }
    }

    histogram
}
/// Removes comment lines and blank lines from C-style source text.
///
/// The filter works line by line on the whitespace-trimmed text:
///
/// * a line starting with `//` is dropped;
/// * a line starting with `/*` opens a block comment, and every line up to
///   and including the one containing the closing `*/` is dropped;
/// * blank lines are dropped.
///
/// The closing `*/` is recognised anywhere on a line, so text following it
/// (`*/ // done`, `/* note */ code`) no longer leaves the filter stuck
/// inside the comment; such a line is still dropped as a whole. The opening
/// `/*` itself is never taken as part of the terminator, so `/*/` opens a
/// comment rather than closing one.
///
/// Kept lines are emitted with a single `\n` terminator each. Input that is
/// not valid UTF-8 is returned unchanged.
pub fn filter_code_comments(data: &[u8]) -> Vec<u8> {
    let Ok(text) = std::str::from_utf8(data) else {
        return data.to_vec();
    };
    let mut result = Vec::new();
    let mut in_block_comment = false;
    for line in text.lines() {
        let trimmed = line.trim();
        // The part of the line in which a terminator may appear: all of it
        // when already inside a comment, otherwise what follows an opening
        // `/*` (`None` means this line is not comment text at all).
        let comment_text = if in_block_comment {
            Some(trimmed)
        } else {
            trimmed.strip_prefix("/*")
        };
        if let Some(rest) = comment_text {
            in_block_comment = !rest.contains("*/");
            continue;
        }
        if trimmed.is_empty() || trimmed.starts_with("//") {
            continue;
        }
        result.extend_from_slice(line.as_bytes());
        result.push(b'\n');
    }
    result
}
/// Strips fenced code blocks from Markdown text.
///
/// Any line whose trimmed form begins with three backticks toggles the
/// "inside a fence" state and is itself removed; lines seen while inside a
/// fence are removed too. All remaining lines are written out followed by
/// a single `\n`. Input that is not valid UTF-8 is returned unchanged.
pub fn filter_markdown_code(data: &[u8]) -> Vec<u8> {
    let Ok(text) = std::str::from_utf8(data) else {
        return data.to_vec();
    };

    let mut kept = Vec::new();
    let mut fenced = false;
    for line in text.lines() {
        if line.trim().starts_with("```") {
            fenced = !fenced;
        } else if !fenced {
            kept.extend_from_slice(line.as_bytes());
            kept.push(b'\n');
        }
    }
    kept
}
// Unit tests for the counting helpers defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// ---- count_lines: counts `\n` bytes ----
#[test]
fn test_count_lines_empty() {
assert_eq!(count_lines(b""), 0);
}
#[test]
fn test_count_lines_single() {
assert_eq!(count_lines(b"hello\n"), 1);
}
#[test]
fn test_count_lines_multiple() {
assert_eq!(count_lines(b"line1\nline2\nline3\n"), 3);
}
// Only newline bytes are counted, so the unterminated last line is not.
#[test]
fn test_count_lines_no_trailing_newline() {
assert_eq!(count_lines(b"line1\nline2"), 1);
}
// ---- count_all_words: runs of non-whitespace bytes ----
#[test]
fn test_count_words_empty() {
assert_eq!(count_all_words(b""), 0);
}
#[test]
fn test_count_words_single() {
assert_eq!(count_all_words(b"hello"), 1);
}
#[test]
fn test_count_words_multiple() {
assert_eq!(count_all_words(b"hello world foo bar"), 4);
}
#[test]
fn test_count_words_multiple_spaces() {
assert_eq!(count_all_words(b"hello world"), 2);
}
#[test]
fn test_count_words_newlines() {
assert_eq!(count_all_words(b"hello\nworld\nfoo"), 3);
}
// Tabs, carriage returns and newlines all act as separators.
#[test]
fn test_count_words_mixed_whitespace() {
assert_eq!(count_all_words(b"hello\t\nworld \r\nfoo"), 3);
}
// ---- count_pattern: non-overlapping substring occurrences ----
#[test]
fn test_count_pattern_empty_data() {
assert_eq!(count_pattern(b"", b"test"), 0);
}
// An empty pattern is defined to match nothing.
#[test]
fn test_count_pattern_empty_pattern() {
assert_eq!(count_pattern(b"test", b""), 0);
}
#[test]
fn test_count_pattern_single_occurrence() {
assert_eq!(count_pattern(b"hello world", b"world"), 1);
}
#[test]
fn test_count_pattern_multiple_occurrences() {
assert_eq!(count_pattern(b"foo bar foo baz foo", b"foo"), 3);
}
// Matches must not overlap: "aaa" holds one "aa", "aaaa" holds two.
#[test]
fn test_count_pattern_non_overlapping() {
assert_eq!(count_pattern(b"aaa", b"aa"), 1);
assert_eq!(count_pattern(b"aaaa", b"aa"), 2);
}
#[test]
fn test_count_pattern_no_match() {
assert_eq!(count_pattern(b"hello world", b"xyz"), 0);
}
#[test]
fn test_count_pattern_byte_pattern() {
assert_eq!(count_pattern(b"a\nb\nc\n", b"\n"), 3);
}
// ---- large inputs ----
// Both inputs are 1,000,000 bytes: above PARALLEL_THRESHOLD (512 KiB) but
// below CHUNK_SIZE (1 MiB), so the parallel branch runs over a single chunk.
#[test]
fn test_large_data_parallel() {
let large_text = "word ".repeat(200_000);
let bytes = large_text.as_bytes();
assert_eq!(count_all_words(bytes), 200_000);
let large_lines = b"line\n".repeat(200_000);
assert_eq!(count_lines(&large_lines), 200_000);
}
// The input is CHUNK_SIZE + 1 bytes long: 'b' is the last byte of the first
// chunk and 'c' the first byte of the second. With no whitespace the whole
// input is one word spanning the boundary; replacing 'b' with a space
// splits it into two words.
#[test]
fn test_chunk_boundary_words() {
let chunk_size = CHUNK_SIZE;
let mut data = vec![b'a'; chunk_size - 1];
data.push(b'b');
data.push(b'c');
assert_eq!(count_all_words(&data), 1);
data[chunk_size - 1] = b' ';
assert_eq!(count_all_words(&data), 2);
}
// The 8-byte pattern starts 4 bytes before the end of the first chunk, so
// half of it lies on each side of the chunk boundary.
#[test]
fn test_chunk_boundary_pattern() {
let chunk_size = CHUNK_SIZE;
let pattern = b"boundary";
let mut data = vec![b'x'; chunk_size - 4];
data.extend_from_slice(pattern);
data.extend_from_slice(b"yyyyyy");
assert_eq!(count_pattern(&data, pattern), 1);
}
// ---- count_chars: Unicode scalar values, not bytes ----
#[test]
fn test_count_chars_empty() {
assert_eq!(count_chars(b""), 0);
}
#[test]
fn test_count_chars_ascii() {
assert_eq!(count_chars(b"hello world"), 11);
}
// Multi-byte characters (CJK, emoji) each count as one character.
#[test]
fn test_count_chars_utf8() {
assert_eq!(count_chars("hello 世界".as_bytes()), 8);
assert_eq!(count_chars("🦀 Rust".as_bytes()), 6);
}
// 'é' takes two bytes in UTF-8, so the text is 5 bytes but 4 characters.
#[test]
fn test_count_chars_vs_bytes() {
let text = "café";
assert_eq!(count_chars(text.as_bytes()), 4);
assert_eq!(text.as_bytes().len(), 5);
}
// ---- max_line_length: longest line in bytes, newline excluded ----
#[test]
fn test_max_line_length_empty() {
assert_eq!(max_line_length(b""), 0);
}
// A line without a terminating newline still counts.
#[test]
fn test_max_line_length_single_line() {
assert_eq!(max_line_length(b"hello"), 5);
}
#[test]
fn test_max_line_length_multiple_lines() {
assert_eq!(max_line_length(b"hi\nhello\nbye"), 5);
}
#[test]
fn test_max_line_length_trailing_newline() {
assert_eq!(max_line_length(b"hello\nworld\n"), 5);
}
// Empty lines around the text do not affect the maximum.
#[test]
fn test_max_line_length_empty_lines() {
assert_eq!(max_line_length(b"\n\nhello\n\n"), 5);
}
}