// kazoe 0.1.2 - Docs.rs

use memchr::memmem::Finder;

/// Size in bytes (1 MiB) of each slice that the parallel code paths hand to a rayon task.
const CHUNK_SIZE: usize = 1024 * 1024;
/// Inputs shorter than this many bytes (512 KiB) are processed sequentially on the calling
/// thread instead of being split into chunks.
const PARALLEL_THRESHOLD: usize = 512 * 1024;

/// Counts the newline (`\n`) bytes in `data`.
///
/// A final line that is not terminated by `\n` is therefore not counted. Buffers of at
/// least `PARALLEL_THRESHOLD` bytes are scanned in `CHUNK_SIZE` slices in parallel; the
/// per-chunk counts are simply added, because a single byte can never straddle a chunk
/// boundary.
pub fn count_lines(data: &[u8]) -> usize {
    use rayon::prelude::*;

    let newlines_in = |bytes: &[u8]| memchr::memchr_iter(b'\n', bytes).count();

    if data.len() >= PARALLEL_THRESHOLD {
        data.par_chunks(CHUNK_SIZE).map(newlines_in).sum()
    } else {
        newlines_in(data)
    }
}

pub fn count_all_words(data: &[u8]) -> usize {
    if data.is_empty() {
        return 0;
    }

    if data.len() < PARALLEL_THRESHOLD {
        return count_words_sequential(data);
    }

    use rayon::prelude::*;

    let num_chunks = data.len().div_ceil(CHUNK_SIZE);

    let count: usize = (0..num_chunks)
        .into_par_iter()
        .map(|i| {
            let start = i * CHUNK_SIZE;
            let end = ((i + 1) * CHUNK_SIZE).min(data.len());
            let chunk = &data[start..end];
            count_words_in_chunk(chunk)
        })
        .sum();

    let mut overcounted = 0;
    for i in 1..num_chunks {
        let prev_end_idx = (i * CHUNK_SIZE) - 1;
        let curr_start_idx = i * CHUNK_SIZE;

        if !data[prev_end_idx].is_ascii_whitespace() && !data[curr_start_idx].is_ascii_whitespace()
        {
            overcounted += 1;
        }
    }

    count - overcounted
}

/// Counts maximal runs of non-whitespace bytes in `data` with a single forward pass.
///
/// A byte starts a word when it is not ASCII whitespace and the byte before it (or the
/// start of the slice) is.
#[inline]
fn count_words_sequential(data: &[u8]) -> usize {
    let mut after_whitespace = true;

    data.iter()
        .filter(|byte| {
            let is_space = byte.is_ascii_whitespace();
            let starts_word = after_whitespace && !is_space;
            after_whitespace = is_space;
            starts_word
        })
        .count()
}

/// Counts the words inside one chunk, treating the chunk as if it were a complete input.
///
/// The chunk is split on ASCII whitespace bytes and the non-empty pieces are counted; a
/// word cut off by the start or end of the chunk still counts as one word here.
#[inline]
fn count_words_in_chunk(chunk: &[u8]) -> usize {
    chunk
        .split(u8::is_ascii_whitespace)
        .filter(|piece| !piece.is_empty())
        .count()
}

/// Counts the non-overlapping occurrences of `pattern` in `data`, scanning left to right.
///
/// Returns 0 when either `data` or `pattern` is empty.
///
/// Large inputs are scanned in parallel, but only when that provably gives the same answer
/// as one left-to-right scan:
///
/// * the pattern must not be able to overlap a copy of itself (it has no proper prefix
///   that is also a suffix). Otherwise which occurrences a greedy scan picks depends on
///   where the scan starts, and restarting at every chunk boundary changes the total
///   (e.g. `aa` in a long run of `a`s);
/// * the pattern must fit in one chunk, so an occurrence crosses at most one chunk
///   boundary and cannot be counted once per boundary.
///
/// When either condition fails the whole buffer is scanned sequentially.
pub fn count_pattern(data: &[u8], pattern: &[u8]) -> usize {
    if data.is_empty() || pattern.is_empty() {
        return 0;
    }

    let finder = Finder::new(pattern);

    if data.len() < PARALLEL_THRESHOLD
        || pattern.len() > CHUNK_SIZE
        || pattern_self_overlaps(pattern)
    {
        return finder.find_iter(data).count();
    }

    use rayon::prelude::*;

    let overlap = pattern.len() - 1;
    let num_chunks = data.len().div_ceil(CHUNK_SIZE);

    // Occurrences that lie entirely inside one chunk.
    let within_chunks: usize = (0..num_chunks)
        .into_par_iter()
        .map(|i| {
            let start = i * CHUNK_SIZE;
            let end = ((i + 1) * CHUNK_SIZE).min(data.len());
            finder.find_iter(&data[start..end]).count()
        })
        .sum();

    // Occurrences that cross a chunk boundary. Each window holds `overlap` bytes on either
    // side of the boundary, so it is too short to contain a match that does not cross it.
    // `overlap < CHUNK_SIZE` here, so the subtraction cannot underflow.
    let across_boundaries: usize = (1..num_chunks)
        .map(|i| {
            let boundary = i * CHUNK_SIZE;
            let window_end = (boundary + overlap).min(data.len());
            finder.find_iter(&data[boundary - overlap..window_end]).count()
        })
        .sum();

    within_chunks + across_boundaries
}

/// Returns `true` when two occurrences of `pattern` can overlap, i.e. when some proper,
/// non-empty prefix of `pattern` is also a suffix of it.
///
/// Computed with the Knuth-Morris-Pratt prefix function in time linear in the pattern
/// length.
fn pattern_self_overlaps(pattern: &[u8]) -> bool {
    let mut border = vec![0usize; pattern.len()];
    let mut matched = 0;

    for i in 1..pattern.len() {
        while matched > 0 && pattern[i] != pattern[matched] {
            matched = border[matched - 1];
        }
        if pattern[i] == pattern[matched] {
            matched += 1;
        }
        border[i] = matched;
    }

    // `matched` is now the length of the longest proper border of the whole pattern.
    matched > 0
}

/// Counts the Unicode scalar values (`char`s) in `data` when it is valid UTF-8, and falls
/// back to the byte length `data.len()` when it is not.
///
/// Large inputs are validated and counted in parallel. Chunk boundaries are moved forward
/// past UTF-8 continuation bytes so that a multi-byte character is never split between two
/// chunks; without this, valid text would fail per-chunk validation and be counted in
/// bytes. The adjusted chunks still tile the whole buffer, so every chunk is valid exactly
/// when the whole buffer is valid, and the result matches the sequential path.
pub fn count_chars(data: &[u8]) -> usize {
    if data.is_empty() {
        return 0;
    }

    if data.len() < PARALLEL_THRESHOLD {
        return std::str::from_utf8(data)
            .map(|s| s.chars().count())
            .unwrap_or(data.len());
    }

    use rayon::prelude::*;

    let num_chunks = data.len().div_ceil(CHUNK_SIZE);

    let per_chunk: Vec<Option<usize>> = (0..num_chunks)
        .into_par_iter()
        .map(|i| {
            // The first chunk must start at byte 0 so that no leading bytes are skipped.
            let start = if i == 0 {
                0
            } else {
                utf8_aligned_offset(data, i * CHUNK_SIZE)
            };
            let end = utf8_aligned_offset(data, (i + 1) * CHUNK_SIZE);

            std::str::from_utf8(&data[start..end])
                .ok()
                .map(|s| s.chars().count())
        })
        .collect();

    // `None` for any chunk means the buffer as a whole is not valid UTF-8.
    per_chunk
        .into_iter()
        .sum::<Option<usize>>()
        .unwrap_or(data.len())
}

/// Clamps `offset` to `data.len()` and then advances it past at most three UTF-8
/// continuation bytes (`0b10xx_xxxx`).
///
/// A character in valid UTF-8 has at most three continuation bytes, so for valid input the
/// result is always a character boundary. For invalid input the result may still point at
/// a continuation byte, which makes the chunk starting there fail validation.
fn utf8_aligned_offset(data: &[u8], offset: usize) -> usize {
    let offset = offset.min(data.len());
    let continuation_bytes = data[offset..]
        .iter()
        .take(3)
        .take_while(|&&byte| (byte & 0xC0) == 0x80)
        .count();

    offset + continuation_bytes
}

pub fn max_line_length(data: &[u8]) -> usize {
    if data.is_empty() {
        return 0;
    }

    if data.len() < PARALLEL_THRESHOLD {
        return max_line_length_sequential(data);
    }

    use rayon::prelude::*;

    let num_chunks = data.len().div_ceil(CHUNK_SIZE);

    (0..num_chunks)
        .into_par_iter()
        .map(|i| {
            let start = i * CHUNK_SIZE;
            let end = ((i + 1) * CHUNK_SIZE).min(data.len());
            let chunk = &data[start..end];
            max_line_length_sequential(chunk)
        })
        .max()
        .unwrap_or(0)
}

/// Returns the length in bytes of the longest `\n`-separated segment of `data`.
///
/// The segment after the last newline (possibly empty) takes part in the comparison, so
/// an unterminated final line is measured too. An empty slice yields 0.
#[inline]
fn max_line_length_sequential(data: &[u8]) -> usize {
    data.split(|&byte| byte == b'\n')
        .map(<[u8]>::len)
        .max()
        .unwrap_or(0)
}

/// Heuristically decides whether `data` looks like binary content.
///
/// Only the first 8192 bytes are inspected. The data is reported as binary as soon as one
/// of them is a control byte below 0x20 (which includes NUL) other than `\n`, `\r` or
/// `\t`. An empty slice is not binary.
pub fn is_binary(data: &[u8]) -> bool {
    data.iter()
        .take(8192)
        .any(|&byte| byte < b' ' && !matches!(byte, b'\n' | b'\r' | b'\t'))
}

use std::collections::{HashMap, HashSet};

/// Counts the distinct words in `data`, where a word is a maximal run of bytes that are
/// not ASCII whitespace. Words are compared byte for byte (case-sensitively).
///
/// The split is done on raw bytes, so input that is not valid UTF-8 is still counted
/// instead of being reported as having no words; this matches how `count_all_words`
/// treats its input. For valid UTF-8 the result is the same as splitting the decoded text
/// on ASCII whitespace, because ASCII bytes never occur inside a multi-byte sequence.
pub fn count_unique_words(data: &[u8]) -> usize {
    data.split(u8::is_ascii_whitespace)
        .filter(|word| !word.is_empty())
        .collect::<HashSet<&[u8]>>()
        .len()
}

/// Summary statistics over the lengths (in bytes, excluding `\n`) of the lines in a buffer.
#[derive(Debug, Clone, PartialEq)]
pub struct Statistics {
    /// Arithmetic mean of the line lengths.
    pub mean_line_length: f64,
    /// Median line length; with an even number of lines this is the average of the two
    /// middle lengths, rounded down.
    pub median_line_length: usize,
    /// Population standard deviation of the line lengths.
    pub std_dev: f64,
    /// Length of the shortest line.
    pub min_line_length: usize,
    /// Length of the longest line.
    pub max_line_length: usize,
    /// Number of lines whose length is zero.
    pub empty_lines: usize,
}

/// Computes line-length [`Statistics`] for `data`.
///
/// Every `\n` terminates a line; bytes after the last `\n` form one more line only when
/// there is at least one such byte. A trailing newline therefore does not add a phantom
/// empty line (which would otherwise drag the minimum, median and mean down and inflate
/// `empty_lines`). When `data` contains no lines at all, every field is zero.
pub fn calculate_statistics(data: &[u8]) -> Statistics {
    let mut line_lengths: Vec<usize> = Vec::new();
    let mut current_len = 0;

    for &byte in data {
        if byte == b'\n' {
            line_lengths.push(current_len);
            current_len = 0;
        } else {
            current_len += 1;
        }
    }

    // Final line that is not terminated by a newline.
    if current_len > 0 {
        line_lengths.push(current_len);
    }

    if line_lengths.is_empty() {
        return Statistics {
            mean_line_length: 0.0,
            median_line_length: 0,
            std_dev: 0.0,
            min_line_length: 0,
            max_line_length: 0,
            empty_lines: 0,
        };
    }

    let line_count = line_lengths.len();
    let empty_lines = line_lengths.iter().filter(|&&len| len == 0).count();

    let total: usize = line_lengths.iter().sum();
    let mean = total as f64 / line_count as f64;

    let variance = line_lengths
        .iter()
        .map(|&len| {
            let diff = len as f64 - mean;
            diff * diff
        })
        .sum::<f64>()
        / line_count as f64;

    // Sorting makes the median, minimum and maximum direct index lookups.
    line_lengths.sort_unstable();
    let mid = line_count / 2;
    let median = if line_count % 2 == 0 {
        (line_lengths[mid - 1] + line_lengths[mid]) / 2
    } else {
        line_lengths[mid]
    };

    Statistics {
        mean_line_length: mean,
        median_line_length: median,
        std_dev: variance.sqrt(),
        min_line_length: line_lengths[0],
        max_line_length: line_lengths[line_count - 1],
        empty_lines,
    }
}

/// Builds a histogram of line lengths in buckets of ten bytes.
///
/// Each key is the lower bound of a bucket (0, 10, 20, ...) and each value is the number
/// of lines whose byte length falls in `key..key + 10`. Every `\n`-terminated line is
/// counted, including empty ones; text after the last newline is counted only when it is
/// non-empty.
pub fn generate_histogram(data: &[u8]) -> HashMap<usize, usize> {
    let mut lengths = data.split(|&byte| byte == b'\n').map(<[u8]>::len);

    // `split` always yields a final piece: the text after the last newline. It is a line
    // of its own only when it actually contains bytes.
    let unterminated = lengths.next_back().filter(|&len| len > 0);

    let mut histogram = HashMap::new();
    for len in lengths.chain(unterminated) {
        *histogram.entry(len - len % 10).or_default() += 1;
    }

    histogram
}

/// Returns a copy of `data` without comment lines and blank lines, using a line-based
/// heuristic for C-style comments.
///
/// * A line whose trimmed text starts with `/*` opens a block comment; lines are dropped
///   until (and including) a line whose trimmed text ends with `*/`.
/// * Lines whose trimmed text starts with `//` or is empty are dropped.
/// * Every other line is kept as written and terminated with a single `\n`.
///
/// Comments that begin after code on the same line are not recognised. Input that is not
/// valid UTF-8 is returned unchanged.
pub fn filter_code_comments(data: &[u8]) -> Vec<u8> {
    let Ok(text) = std::str::from_utf8(data) else {
        return data.to_vec();
    };

    let mut kept = Vec::new();
    let mut inside_block = false;

    for line in text.lines() {
        let stripped = line.trim();

        inside_block = inside_block || stripped.starts_with("/*");
        if inside_block {
            // The block stays open unless this very line closes it.
            inside_block = !stripped.ends_with("*/");
            continue;
        }

        if stripped.is_empty() || stripped.starts_with("//") {
            continue;
        }

        kept.extend_from_slice(line.as_bytes());
        kept.push(b'\n');
    }

    kept
}

/// Returns a copy of `data` with Markdown fenced code blocks removed.
///
/// A line whose trimmed text starts with three backticks toggles the "inside a fence"
/// state and is itself dropped; lines read while inside a fence are dropped as well. All
/// remaining lines are kept as written and terminated with a single `\n`. Input that is
/// not valid UTF-8 is returned unchanged.
pub fn filter_markdown_code(data: &[u8]) -> Vec<u8> {
    let Ok(text) = std::str::from_utf8(data) else {
        return data.to_vec();
    };

    let mut inside_fence = false;
    let mut kept = Vec::new();

    for line in text.lines() {
        if line.trim().starts_with("```") {
            inside_fence = !inside_fence;
        } else if !inside_fence {
            kept.extend_from_slice(line.as_bytes());
            kept.push(b'\n');
        }
    }

    kept
}

// Unit tests for the counting helpers: lines, words, patterns, characters and maximum
// line length, on small inputs plus a few inputs sized relative to CHUNK_SIZE.
#[cfg(test)]
mod tests {
    use super::*;

    // --- count_lines: counts `\n` bytes ---

    #[test]
    fn test_count_lines_empty() {
        assert_eq!(count_lines(b""), 0);
    }

    #[test]
    fn test_count_lines_single() {
        assert_eq!(count_lines(b"hello\n"), 1);
    }

    #[test]
    fn test_count_lines_multiple() {
        assert_eq!(count_lines(b"line1\nline2\nline3\n"), 3);
    }

    // A final line without a terminating newline is not counted.
    #[test]
    fn test_count_lines_no_trailing_newline() {
        assert_eq!(count_lines(b"line1\nline2"), 1);
    }

    // --- count_all_words: words are runs of non-whitespace bytes ---

    #[test]
    fn test_count_words_empty() {
        assert_eq!(count_all_words(b""), 0);
    }

    #[test]
    fn test_count_words_single() {
        assert_eq!(count_all_words(b"hello"), 1);
    }

    #[test]
    fn test_count_words_multiple() {
        assert_eq!(count_all_words(b"hello world foo bar"), 4);
    }

    // Consecutive separators do not create empty words.
    #[test]
    fn test_count_words_multiple_spaces() {
        assert_eq!(count_all_words(b"hello    world"), 2);
    }

    #[test]
    fn test_count_words_newlines() {
        assert_eq!(count_all_words(b"hello\nworld\nfoo"), 3);
    }

    #[test]
    fn test_count_words_mixed_whitespace() {
        assert_eq!(count_all_words(b"hello\t\nworld  \r\nfoo"), 3);
    }

    // --- count_pattern: non-overlapping substring occurrences ---

    #[test]
    fn test_count_pattern_empty_data() {
        assert_eq!(count_pattern(b"", b"test"), 0);
    }

    // An empty pattern is defined to match nothing.
    #[test]
    fn test_count_pattern_empty_pattern() {
        assert_eq!(count_pattern(b"test", b""), 0);
    }

    #[test]
    fn test_count_pattern_single_occurrence() {
        assert_eq!(count_pattern(b"hello world", b"world"), 1);
    }

    #[test]
    fn test_count_pattern_multiple_occurrences() {
        assert_eq!(count_pattern(b"foo bar foo baz foo", b"foo"), 3);
    }

    // Matches are consumed left to right: "aaa" holds one "aa", "aaaa" holds two.
    #[test]
    fn test_count_pattern_non_overlapping() {
        assert_eq!(count_pattern(b"aaa", b"aa"), 1);
        assert_eq!(count_pattern(b"aaaa", b"aa"), 2);
    }

    #[test]
    fn test_count_pattern_no_match() {
        assert_eq!(count_pattern(b"hello world", b"xyz"), 0);
    }

    #[test]
    fn test_count_pattern_byte_pattern() {
        assert_eq!(count_pattern(b"a\nb\nc\n", b"\n"), 3);
    }

    // --- inputs sized relative to PARALLEL_THRESHOLD / CHUNK_SIZE ---

    // Both inputs are 1,000,000 bytes: above PARALLEL_THRESHOLD (so the parallel branch
    // runs) but below CHUNK_SIZE, so they still fit in a single chunk.
    #[test]
    fn test_large_data_parallel() {
        let large_text = "word ".repeat(200_000);
        let bytes = large_text.as_bytes();

        assert_eq!(count_all_words(bytes), 200_000);

        let large_lines = b"line\n".repeat(200_000);
        assert_eq!(count_lines(&large_lines), 200_000);
    }

    // The input is CHUNK_SIZE + 1 bytes long, so index CHUNK_SIZE starts a second chunk.
    #[test]
    fn test_chunk_boundary_words() {
        let chunk_size = CHUNK_SIZE;
        let mut data = vec![b'a'; chunk_size - 1];
        data.push(b'b');
        data.push(b'c');

        // One unbroken word that continues across the chunk boundary.
        assert_eq!(count_all_words(&data), 1);

        // A space as the last byte of the first chunk splits it into two words.
        data[chunk_size - 1] = b' ';
        assert_eq!(count_all_words(&data), 2);
    }

    // The 8-byte pattern starts 4 bytes before the chunk boundary, so half of it lies in
    // each chunk.
    #[test]
    fn test_chunk_boundary_pattern() {
        let chunk_size = CHUNK_SIZE;
        let pattern = b"boundary";
        let mut data = vec![b'x'; chunk_size - 4];
        data.extend_from_slice(pattern);
        data.extend_from_slice(b"yyyyyy");

        assert_eq!(count_pattern(&data, pattern), 1);
    }

    // --- count_chars: Unicode scalar values for valid UTF-8 ---

    #[test]
    fn test_count_chars_empty() {
        assert_eq!(count_chars(b""), 0);
    }

    #[test]
    fn test_count_chars_ascii() {
        assert_eq!(count_chars(b"hello world"), 11);
    }

    // Multi-byte characters count once each.
    #[test]
    fn test_count_chars_utf8() {
        assert_eq!(count_chars("hello 世界".as_bytes()), 8);
        assert_eq!(count_chars("🦀 Rust".as_bytes()), 6);
    }

    // "café" is four characters but five bytes, because "é" takes two bytes in UTF-8.
    #[test]
    fn test_count_chars_vs_bytes() {
        let text = "café";
        assert_eq!(count_chars(text.as_bytes()), 4);
        assert_eq!(text.as_bytes().len(), 5);
    }

    // --- max_line_length: longest line in bytes, newline excluded ---

    #[test]
    fn test_max_line_length_empty() {
        assert_eq!(max_line_length(b""), 0);
    }

    #[test]
    fn test_max_line_length_single_line() {
        assert_eq!(max_line_length(b"hello"), 5);
    }

    #[test]
    fn test_max_line_length_multiple_lines() {
        assert_eq!(max_line_length(b"hi\nhello\nbye"), 5);
    }

    #[test]
    fn test_max_line_length_trailing_newline() {
        assert_eq!(max_line_length(b"hello\nworld\n"), 5);
    }

    #[test]
    fn test_max_line_length_empty_lines() {
        assert_eq!(max_line_length(b"\n\nhello\n\n"), 5);
    }
}