geopagg 0.3.1

An implementation of the geopagg algorithm for p-value aggregation of sgRNAs across gene groupings
Documentation
use std::collections::HashMap;

use itertools::Itertools;

/// Returns the positions in `haystack` whose entry is exactly equal to `needle`.
///
/// Comparison is a full, case-sensitive string equality check; entries that merely
/// contain or start with `needle` are not reported. Positions are returned in
/// ascending order, and the result is empty when nothing matches.
pub fn index_mask(needle: &str, haystack: &[String]) -> Vec<usize> {
    let mut matches = Vec::new();
    for (position, candidate) in haystack.iter().enumerate() {
        if candidate.as_str() == needle {
            matches.push(position);
        }
    }
    matches
}

/// Returns the positions in `haystack` whose entry starts with `prefix`.
///
/// Only a leading match counts: an entry that contains `prefix` somewhere after
/// its first character is not reported. An empty `prefix` matches every entry.
/// Positions are returned in ascending order.
pub fn index_prefix_mask(prefix: &str, haystack: &[String]) -> Vec<usize> {
    haystack
        .iter()
        .enumerate()
        // `starts_with` (not `contains`) so that e.g. "ctrl_non-targeting" is not
        // mistaken for a "non-targeting" label.
        .filter(|(_, target)| target.starts_with(prefix))
        .map(|(i, _)| i)
        .collect()
}

/// Standardizes `values` into z-scores: `(x - mean) / std_dev`.
///
/// The standard deviation is the population form (the squared deviations are
/// divided by the number of values, not by one less). The output has the same
/// length and order as the input; an empty input yields an empty vector.
///
/// No guard is applied to the division, so when the computed standard deviation
/// is zero (for example a single value) the resulting scores are non-finite.
pub fn zscore_transform(values: &[f64]) -> Vec<f64> {
    let count = values.len() as f64;
    let center = values.iter().sum::<f64>() / count;

    // Deviations from the mean are needed twice: for the spread and for the scores.
    let deviations: Vec<f64> = values.iter().map(|value| value - center).collect();
    let spread = (deviations.iter().map(|dev| dev.powi(2)).sum::<f64>() / count).sqrt();

    deviations.into_iter().map(|dev| dev / spread).collect()
}

/// Gathers the elements of `data` found at the given `indices`.
///
/// The output follows the order of `indices`, so an index may be repeated and
/// its element will be copied each time.
///
/// # Panics
///
/// Panics if any index is out of bounds for `data`.
pub fn select_indices<T: Copy>(indices: &[usize], data: &[T]) -> Vec<T> {
    let mut picked = Vec::with_capacity(indices.len());
    for &idx in indices {
        picked.push(data[idx]);
    }
    picked
}

/// Calculates the membership sizes of each gene and the occurence of each membership size
pub fn calculate_group_sizes(genes: &[String], unique_genes: &[&String]) -> Vec<(usize, usize)> {
    let mut group_sizes = HashMap::new();
    for gene in unique_genes {
        let gene_indices = index_mask(gene, genes);
        let membership_size = gene_indices.len();
        if let Some(group_size) = group_sizes.get_mut(&membership_size) {
            *group_size += 1;
        } else {
            group_sizes.insert(membership_size, 1);
        }
    }
    group_sizes.into_iter().sorted().collect()
}

/// Unit tests for the index-mask, selection, group-size and z-score helpers.
#[cfg(test)]
mod testing {
    use itertools::Itertools;

    /// Exact matching finds the single equal entry.
    #[test]
    fn test_index_mask() {
        let haystack = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let needle = "a";
        let result = super::index_mask(needle, &haystack);
        assert_eq!(result, vec![0]);
    }

    /// Prefix matching reports every entry that shares the leading needle.
    #[test]
    fn test_index_prefix_mask_shared_prefix() {
        let haystack = vec![
            "gene_a".to_string(),
            "gene_a_b".to_string(),
            "gene_a_c".to_string(),
        ];
        let needle = "gene_a";
        let result = super::index_prefix_mask(needle, &haystack);
        assert_eq!(result, vec![0, 1, 2]);
    }

    /// Prefix matching skips entries that do not carry the prefix.
    #[test]
    fn test_index_prefix_mask() {
        let haystack = vec![
            "non-targeting_0001".to_string(),
            "non-targeting_0002".to_string(),
            "some_gene".to_string(),
            "non-targeting_0003".to_string(),
        ];
        let needle = "non-targeting";
        let result = super::index_prefix_mask(needle, &haystack);
        assert_eq!(result, vec![0, 1, 3]);
    }

    /// Exact matching must not pick up entries that only share a prefix.
    #[test]
    fn test_index_mask_shared_prefix() {
        let haystack = vec![
            "gene_a".to_string(),
            "gene_a_b".to_string(),
            "gene_a_c".to_string(),
        ];
        let needle = "gene_a";
        let result = super::index_mask(needle, &haystack);
        assert_eq!(result, vec![0]);
    }

    /// Selection returns the elements at the requested positions, in order.
    #[test]
    fn test_select_indices() {
        let data = vec!["a", "b", "c"];
        let indices = vec![0, 2];
        let result = super::select_indices(&indices, &data);
        assert_eq!(result, vec!["a", "c"]);
    }

    /// Two genes of size one and one gene of size two.
    #[test]
    fn test_calculate_group_sizes() {
        let genes = vec![
            "a".to_string(),
            "a".to_string(),
            "b".to_string(),
            "c".to_string(),
        ];
        let unique_genes = [String::from("a"), String::from("b"), String::from("c")];
        let unique_gene_references = unique_genes.iter().collect::<Vec<_>>();
        let result = super::calculate_group_sizes(&genes, &unique_gene_references);

        assert!(result.iter().contains(&(1, 2)));
        assert!(result.iter().contains(&(2, 1)));
        assert_eq!(result.len(), 2);
    }

    /// Two genes of size two and one gene of size one.
    #[test]
    fn test_calculate_group_sizes_with_duplicates() {
        let genes = vec![
            "a".to_string(),
            "a".to_string(),
            "b".to_string(),
            "b".to_string(),
            "c".to_string(),
        ];
        let unique_genes = [String::from("a"), String::from("b"), String::from("c")];
        let unique_gene_references = unique_genes.iter().collect::<Vec<_>>();
        let result = super::calculate_group_sizes(&genes, &unique_gene_references);

        assert!(result.iter().contains(&(2, 2)));
        assert!(result.iter().contains(&(1, 1)));
        assert_eq!(result.len(), 2);
    }

    /// Z-scores of an ascending sequence.
    #[test]
    fn test_zscore_transform() {
        let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let z = super::zscore_transform(&x);
        let expected = [
            -f64::sqrt(2.0),
            -0.7071067811865475,
            0.0,
            0.7071067811865475,
            f64::sqrt(2.0),
        ];
        // `zip` stops at the shorter side, so check the length explicitly.
        assert_eq!(z.len(), expected.len());
        for (a, b) in z.iter().zip(expected.iter()) {
            assert!((a - b).abs() < 1e-10);
        }
    }

    /// Z-scores keep the input order (descending sequence).
    #[test]
    fn test_zscore_transform_ooo() {
        let x = vec![5.0, 4.0, 3.0, 2.0, 1.0];
        let z = super::zscore_transform(&x);
        let expected = [
            f64::sqrt(2.0),
            0.7071067811865475,
            0.0,
            -0.7071067811865475,
            -f64::sqrt(2.0),
        ];
        // `zip` stops at the shorter side, so check the length explicitly.
        assert_eq!(z.len(), expected.len());
        for (a, b) in z.iter().zip(expected.iter()) {
            assert!((a - b).abs() < 1e-10);
        }
    }
}