biofile 1.10.3

A library for reading bioinformatics related files
Documentation
use std::{
    collections::{HashMap, HashSet},
    fs::{File, OpenOptions},
    io::{BufRead, BufReader},
    iter::FromIterator,
    slice::Iter,
};

use math::{
    partition::integer_partitions::Partition,
    set::ordered_integer_set::OrderedIntegerSet, traits::Collecting,
};

use crate::error::Error;
use num::{FromPrimitive, Integer, ToPrimitive};

pub const CHROM_FIELD_INDEX: usize = 0;
pub const VARIANT_ID_FIELD_INDEX: usize = 1;
pub const COORDINATE_FIELD_INDEX: usize = 3;
pub const FIRST_ALLELE_FIELD_INDEX: usize = 4;
pub const SECOND_ALLELE_FIELD_INDEX: usize = 5;

pub type PartitionKey = String;

pub struct PlinkBim<T: Copy + FromPrimitive + Integer + ToPrimitive> {
    bim_path_list: Vec<String>,
    // maps partition_id to the file line indices
    fileline_partitions: Option<FilelinePartitions<T>>,
}

impl<T: Copy + FromPrimitive + Integer + ToPrimitive> PlinkBim<T> {
    pub fn new(bim_path_list: Vec<String>) -> Result<PlinkBim<T>, Error> {
        Ok(PlinkBim {
            bim_path_list,
            fileline_partitions: None,
        })
    }

    pub fn new_with_partitions(
        bim_path_list: Vec<String>,
        partitions: HashMap<PartitionKey, OrderedIntegerSet<T>>,
    ) -> Result<PlinkBim<T>, Error> {
        let mut bim = PlinkBim::new(bim_path_list)?;
        bim.set_fileline_partitions(Some(FilelinePartitions::new(partitions)));
        Ok(bim)
    }

    pub fn new_with_partition_file(
        bim_path_list: Vec<String>,
        partition_filepath: &str,
    ) -> Result<PlinkBim<T>, Error> {
        let bim = PlinkBim::new(bim_path_list)?;
        bim.into_partitioned_by_file(partition_filepath)
    }

    fn get_buf_list(&self) -> Result<Vec<BufReader<File>>, Error> {
        self.bim_path_list
            .iter()
            .map(|p| Ok(BufReader::new(OpenOptions::new().read(true).open(p)?)))
            .collect::<Result<Vec<BufReader<File>>, Error>>()
    }

    pub fn into_partitioned_by_file(
        mut self,
        partition_file: &str,
    ) -> Result<PlinkBim<T>, Error> {
        let partitions =
            self.get_fileline_partitions_from_partition_file(partition_file)?;

        self.set_fileline_partitions(Some(partitions));
        Ok(self)
    }

    pub fn into_partitioned_by_chrom(mut self) -> Result<PlinkBim<T>, Error> {
        let partitions = self.get_chrom_to_fileline_positions()?;
        self.set_fileline_partitions(Some(FilelinePartitions::new(partitions)));
        Ok(self)
    }

    #[inline]
    pub fn set_fileline_partitions(
        &mut self,
        partitions: Option<FilelinePartitions<T>>,
    ) {
        self.fileline_partitions = partitions;
    }

    pub fn get_chrom_to_fileline_positions(
        &mut self,
    ) -> Result<HashMap<PartitionKey, OrderedIntegerSet<T>>, Error> {
        Ok(self
            .get_all_chroms()?
            .into_iter()
            .map(|chrom| {
                let partition = self.get_chrom_fileline_positions(&chrom)?;
                Ok((chrom, partition))
            })
            .collect::<Result<
                HashMap<PartitionKey, OrderedIntegerSet<T>>,
                Error,
            >>()?)
    }

    pub fn get_fileline_partitions(&self) -> Option<FilelinePartitions<T>> {
        match &self.fileline_partitions {
            None => None,
            Some(p) => Some(p.clone()),
        }
    }

    pub fn get_fileline_partitions_by_ref(
        &self,
    ) -> Option<&FilelinePartitions<T>> {
        match &self.fileline_partitions {
            None => None,
            Some(p) => Some(p),
        }
    }

    pub fn get_fileline_partitions_or(
        &self,
        default_partition_name: &str,
        default_partition_range: OrderedIntegerSet<T>,
    ) -> FilelinePartitions<T> {
        match &self.fileline_partitions {
            None => FilelinePartitions::new(HashMap::from_iter(
                vec![(
                    default_partition_name.to_string(),
                    default_partition_range,
                )]
                .into_iter(),
            )),
            Some(p) => p.clone(),
        }
    }

    /// each line in the partition file has two fields separated by space:
    /// variant_id assigned_partition
    pub fn get_fileline_partitions_from_partition_file(
        &mut self,
        partition_file_path: &str,
    ) -> Result<FilelinePartitions<T>, Error> {
        let id_to_partition_key: HashMap<String, PartitionKey> =
            BufReader::new(
                OpenOptions::new().read(true).open(partition_file_path)?,
            )
            .lines()
            .map(|line| PlinkBim::<T>::get_id_and_partition_from_line(&line?))
            .collect::<Result<HashMap<String, PartitionKey>, Error>>()?;

        let mut partitions: HashMap<PartitionKey, Partition<T>> =
            HashMap::new();
        let mut visited_ids = HashSet::new();
        let mut num_ignored_ids = 0;

        let mut file_line_offset = T::zero();
        for (b, buf) in self.get_buf_list()?.into_iter().enumerate() {
            let mut line_index = T::zero();
            for (i, line) in buf.lines().enumerate() {
                line_index = T::from_usize(i)
                    .expect("failed to convert from usize to type T");

                let id = match line?
                    .split_whitespace()
                    .nth(VARIANT_ID_FIELD_INDEX)
                {
                    None => {
                        return Err(Error::BadFormat(format!(
                            "failed to read the variant id in line {} in bim \
                            file: {}",
                            i + 1,
                            self.bim_path_list[b]
                        )));
                    }
                    Some(id) => id.to_string(),
                };
                visited_ids.insert(id.to_string());
                match id_to_partition_key.get(&id) {
                    None => {
                        if !visited_ids.contains(&id) {
                            num_ignored_ids += 1;
                        }
                    }
                    Some(key) => partitions
                        .entry(key.to_owned())
                        .or_insert_with(OrderedIntegerSet::new)
                        .collect(file_line_offset + line_index),
                }
            }
            file_line_offset = file_line_offset + line_index + T::one();
        }

        if num_ignored_ids > 0 {
            println!("PlinkBim: num_ignored_ids: {}", num_ignored_ids);
        }
        let num_ids_not_in_bim = id_to_partition_key
            .keys()
            .filter(|&k| !visited_ids.contains(k))
            .count();
        if num_ids_not_in_bim > 0 {
            Err(Error::Generic(format!(
                "{} ID(s) from the partition file {} are not in the bim \
                files {:?}",
                num_ids_not_in_bim, partition_file_path, self.bim_path_list
            )))
        } else {
            Ok(FilelinePartitions::new(partitions))
        }
    }

    fn get_id_and_partition_from_line(
        line: &str,
    ) -> Result<(String, PartitionKey), Error> {
        let mut iter = line.split_whitespace();
        let id =
            match iter.next() {
                None => return Err(Error::BadFormat(String::from(
                    "each line in the partition file should have two fields: \
                    id partition_assignment",
                ))),
                Some(id) => id.to_string(),
            };
        match iter.next() {
            None => Err(Error::BadFormat(String::from(
                "each line in the partition file should have two fields: \
                id partition_assignment",
            ))),
            Some(partition) => Ok((id, partition.to_string())),
        }
    }

    #[inline]
    pub fn get_bim_path_list(&self) -> &Vec<String> {
        &self.bim_path_list
    }

    #[allow(clippy::iter_nth_zero)]
    pub fn get_all_chroms(&mut self) -> Result<HashSet<String>, Error> {
        Ok(self
            .get_buf_list()?
            .into_iter()
            .enumerate()
            .map(|(b, buf)| {
                buf.lines()
                    .enumerate()
                    .map(|(i, l)| {
                        Ok(l?
                            .split_whitespace()
                            .nth(CHROM_FIELD_INDEX)
                            .ok_or_else(|| {
                                Error::Generic(format!(
                                   "failed to extract chromosome id from line \
                                   {} in file {}",
                                   i + 1,
                                   self.bim_path_list[b]
                               ))
                            })?
                            .to_string())
                    })
                    .collect::<Result<HashSet<String>, Error>>()
            })
            .collect::<Result<Vec<HashSet<String>>, Error>>()?
            .into_iter()
            .flatten()
            .collect())
    }

    #[allow(clippy::blocks_in_if_conditions, clippy::iter_nth_zero)]
    pub fn get_chrom_fileline_positions(
        &mut self,
        chrom: &str,
    ) -> Result<OrderedIntegerSet<T>, Error> {
        let mut set = OrderedIntegerSet::new();
        let mut file_line_offset = T::zero();

        for (b, buf) in self.get_buf_list()?.into_iter().enumerate() {
            let mut line_index = T::zero();
            for (i, line) in buf.lines().enumerate() {
                line_index =
                    T::from_usize(i).expect("failed to convert usize to T");

                if line?.split_whitespace().nth(CHROM_FIELD_INDEX).ok_or_else(
                    || {
                        Error::Generic(format!(
                            "failed to extract chromosome id from line {} in \
                            file {}",
                            i + 1,
                            self.bim_path_list[b]
                        ))
                    },
                )? == chrom
                {
                    set.collect(file_line_offset + line_index);
                }
            }
            file_line_offset = file_line_offset + line_index + T::one();
        }
        Ok(set)
    }
}

#[derive(Clone, Debug)]
pub struct FilelinePartitions<T: Copy + Integer + ToPrimitive> {
    partitions: HashMap<PartitionKey, Partition<T>>,
    ordered_partition_keys: Vec<String>,
}

impl<T: Copy + Integer + ToPrimitive> FilelinePartitions<T> {
    pub fn new(
        partitions: HashMap<PartitionKey, Partition<T>>,
    ) -> FilelinePartitions<T> {
        let ordered_partition_keys =
            FilelinePartitions::<T>::get_ordered_keys(&partitions);

        FilelinePartitions {
            partitions,
            ordered_partition_keys,
        }
    }

    fn get_ordered_keys(
        partitions: &HashMap<PartitionKey, Partition<T>>,
    ) -> Vec<PartitionKey> {
        let mut keys: Vec<PartitionKey> =
            partitions.keys().map(|s| s.to_string()).collect();

        if keys.iter().filter(|&k| k.parse::<i64>().is_err()).count() > 0 {
            keys.sort();
        } else {
            keys.sort_by_key(|k| k.parse::<i64>().unwrap());
        }
        keys
    }

    #[inline]
    pub fn ordered_partition_keys(&self) -> &Vec<PartitionKey> {
        &self.ordered_partition_keys
    }

    pub fn ordered_partition_array(&self) -> Vec<Partition<T>> {
        self.iter().map(|(_, p)| p.clone()).collect()
    }

    #[inline]
    pub fn to_hash_map(&self) -> HashMap<PartitionKey, Partition<T>> {
        self.partitions.clone()
    }

    #[inline]
    pub fn into_hash_map(self) -> HashMap<PartitionKey, Partition<T>> {
        self.partitions
    }

    pub fn iter(&self) -> FilelinePartitionsIter<T> {
        FilelinePartitionsIter {
            partitions: &self.partitions,
            key_iter: self.ordered_partition_keys.iter(),
        }
    }
}

pub struct FilelinePartitionsIter<'a, T: Copy + Integer + ToPrimitive> {
    partitions: &'a HashMap<String, Partition<T>>,
    key_iter: Iter<'a, String>,
}

impl<'a, T: Copy + Integer + ToPrimitive> Iterator
    for FilelinePartitionsIter<'a, T>
{
    type Item = (&'a str, &'a OrderedIntegerSet<T>);

    fn next(&mut self) -> Option<Self::Item> {
        match self.key_iter.next() {
            None => None,
            Some(key) => Some((key, &self.partitions[key])),
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::plink_bim::PlinkBim;
    use math::set::{
        contiguous_integer_set::ContiguousIntegerSet,
        ordered_integer_set::OrderedIntegerSet,
    };
    use std::{
        collections::{HashMap, HashSet},
        fs::OpenOptions,
        io::{BufWriter, Write},
    };
    use tempfile::NamedTempFile;

    #[inline]
    fn write_bim_line<W: Write>(
        buf_writer: &mut BufWriter<W>,
        chrom: &str,
        id: &str,
        coordinate: u64,
        first_allele: char,
        second_allele: char,
    ) {
        buf_writer
            .write_fmt(format_args!(
                "{} {} {} {} {} {}\n",
                chrom, id, 0, coordinate, first_allele, second_allele
            ))
            .unwrap();
    }

    #[test]
    fn test_get_chrom_to_positions() {
        type Coordinate = i64;

        let bim_file_1 = NamedTempFile::new().unwrap();
        {
            let mut writer = BufWriter::new(&bim_file_1);
            for (chrom, coord, id) in &[
                (1, 2, "chr1:2"),
                (1, 3, "chr1:3"),
                (1, 4, "chr1:4"),
                (1, 7, "chr1:7"),
                (1, 8, "chr1:8"),
                (1, 20, "chr1:20"),
                (3, 4, "chr3:4"),
                (3, 5, "chr3:5"),
                (3, 6, "chr3:6"),
                (3, 10, "chr3:10"),
                (4, 100, "chr4:100"),
                (5, 1, "chr5:1"),
                (5, 10, "chr5:10"),
                (3, 32, "chr3:32"),
                (3, 2, "chr3:2"),
                (3, 1, "chr3:1"),
                (5, 4, "chr5:4"),
                (5, 8, "chr5:8"),
            ] {
                write_bim_line(
                    &mut writer,
                    &chrom.to_string(),
                    id,
                    *coord,
                    'A',
                    'C',
                );
            }
        }
        let bim_file_2 = NamedTempFile::new().unwrap();
        {
            let mut writer = BufWriter::new(&bim_file_2);
            for (chrom, coord, id) in &[
                (1, 0, "chr1:0"),
                (1, 200, "chr1:200"),
                (5, 1000, "chr5:1000"),
                (12, 100, "chr12:100"),
            ] {
                write_bim_line(
                    &mut writer,
                    &chrom.to_string(),
                    id,
                    *coord,
                    'A',
                    'C',
                );
            }
        }
        let bim_temp_path_1 = bim_file_1.into_temp_path();
        let bim_temp_path_2 = bim_file_2.into_temp_path();
        let mut bim = PlinkBim::new(vec![
            bim_temp_path_1.to_str().unwrap().to_string(),
            bim_temp_path_2.to_str().unwrap().to_string(),
        ])
        .unwrap();
        let positions = bim.get_chrom_to_fileline_positions().unwrap();
        let expected: HashMap<String, OrderedIntegerSet<Coordinate>> = vec![
            (
                "1".to_string(),
                OrderedIntegerSet::from(vec![
                    ContiguousIntegerSet::new(0, 5),
                    ContiguousIntegerSet::new(18, 19),
                ]),
            ),
            (
                "3".to_string(),
                OrderedIntegerSet::from(vec![
                    ContiguousIntegerSet::new(6, 9),
                    ContiguousIntegerSet::new(13, 15),
                ]),
            ),
            (
                "4".to_string(),
                OrderedIntegerSet::from(vec![ContiguousIntegerSet::new(
                    10, 10,
                )]),
            ),
            (
                "5".to_string(),
                OrderedIntegerSet::from(vec![
                    ContiguousIntegerSet::new(11, 12),
                    ContiguousIntegerSet::new(16, 17),
                    ContiguousIntegerSet::new(20, 20),
                ]),
            ),
            (
                "12".to_string(),
                OrderedIntegerSet::from(vec![ContiguousIntegerSet::new(
                    21, 21,
                )]),
            ),
        ]
        .into_iter()
        .collect();
        assert_eq!(positions, expected);
    }

    #[test]
    fn test_get_all_chroms() {
        type Coordinate = i64;
        let file = NamedTempFile::new().unwrap();
        {
            let mut writer = BufWriter::new(&file);
            for chrom in &[1, 2, 1, 3, 2, 5] {
                write_bim_line(
                    &mut writer,
                    &chrom.to_string(),
                    "id",
                    0,
                    'A',
                    'C',
                );
            }
        }
        let bim_temp_path = file.into_temp_path();
        let mut bim = PlinkBim::<Coordinate>::new(vec![bim_temp_path
            .to_str()
            .unwrap()
            .to_string()])
        .unwrap();

        let chrom_set = bim.get_all_chroms().unwrap();
        let expected: HashSet<String> = vec!["1", "2", "3", "5"]
            .into_iter()
            .map(|x| x.to_string())
            .collect();
        assert_eq!(chrom_set, expected);
    }

    fn create_dummy_bim() -> (NamedTempFile, NamedTempFile) {
        let bim_temp_file = NamedTempFile::new().unwrap();
        {
            let mut writer = BufWriter::new(&bim_temp_file);
            for (chrom, coord, id) in &[
                (1, 2, "chr1:2"),
                (1, 3, "chr1:3"),
                (1, 4, "chr1:4"),
                (1, 7, "chr1:7"),
                (1, 8, "chr1:8"),
                (1, 20, "chr1:20"),
                (3, 4, "chr3:4"),
                (3, 5, "chr3:5"),
                (3, 6, "chr3:6"),
                (3, 10, "chr3:10"),
                (4, 100, "chr4:100"),
                (5, 1, "chr5:1"),
                (5, 10, "chr5:10"),
                (3, 32, "chr3:32"),
                (3, 2, "chr3:2"),
                (3, 1, "chr3:1"),
                (5, 4, "chr5:4"),
                (5, 8, "chr5:8"),
            ] {
                write_bim_line(
                    &mut writer,
                    &chrom.to_string(),
                    id,
                    *coord,
                    'A',
                    'C',
                );
            }
        }

        let partition_file = NamedTempFile::new().unwrap();
        {
            let mut writer = BufWriter::new(&partition_file);
            for (id, partition) in &[
                ("chr1:4", "p1"),
                ("chr1:7", "p2"),
                ("chr1:8", "p2"),
                ("chr1:20", "p1"),
                ("chr5:1", "p3"),
                ("chr5:10", "p1"),
                ("chr3:4", "p2"),
                ("chr3:5", "p3"),
                ("chr3:6", "p2"),
                ("chr3:10", "p4"),
                ("chr3:32", "p1"),
                ("chr3:2", "p1"),
                ("chr3:1", "p1"),
                ("chr5:4", "p3"),
                ("chr5:8", "p2"),
                ("chr4:100", "p1"),
                ("chr1:2", "p3"),
                ("chr1:3", "p1"),
            ] {
                writer
                    .write_fmt(format_args!("{} {}\n", id, partition))
                    .unwrap();
            }
        }
        (partition_file, bim_temp_file)
    }

    #[test]
    fn test_get_fileline_partitions() {
        let (partition_file, bim_temp_file) = create_dummy_bim();
        let bim_temp_path = bim_temp_file.into_temp_path();
        let mut bim =
            PlinkBim::new(vec![bim_temp_path.to_str().unwrap().to_string()])
                .unwrap();

        let partition_file_path = partition_file.into_temp_path();
        let partitions = bim
            .get_fileline_partitions_from_partition_file(
                partition_file_path.to_str().unwrap(),
            )
            .unwrap()
            .partitions;
        assert_eq!(
            partitions.get("p1").unwrap(),
            &OrderedIntegerSet::from_slice(&[[1, 2], [5, 5], [10, 10], [
                12, 15
            ]])
        );
        assert_eq!(
            partitions.get("p2").unwrap(),
            &OrderedIntegerSet::from_slice(&[[3, 4], [6, 6], [8, 8], [17, 17]])
        );
        assert_eq!(
            partitions.get("p3").unwrap(),
            &OrderedIntegerSet::from_slice(&[[0, 0], [7, 7], [11, 11], [
                16, 16
            ]])
        );
        assert_eq!(
            partitions.get("p4").unwrap(),
            &OrderedIntegerSet::from_slice(&[[9, 9]])
        );

        let mut new_bim = bim
            .into_partitioned_by_file(partition_file_path.to_str().unwrap())
            .unwrap();
        assert_eq!(
            new_bim.get_fileline_partitions_by_ref().unwrap().partitions,
            partitions
        );

        {
            let mut writer = BufWriter::new(
                OpenOptions::new()
                    .write(true)
                    .append(true)
                    .open(partition_file_path.to_str().unwrap())
                    .unwrap(),
            );
            writer
                .write_fmt(format_args!("{} {}\n", "extra_id", "p2"))
                .unwrap();
        }
        assert!(new_bim
            .get_fileline_partitions_from_partition_file(
                partition_file_path.to_str().unwrap()
            )
            .is_err());
    }

    #[test]
    fn test_fileline_partitions_iter() {
        let (partition_file, bim_temp_file) = create_dummy_bim();
        let bim_temp_path = bim_temp_file.into_temp_path();
        let mut bim =
            PlinkBim::new(vec![bim_temp_path.to_str().unwrap().to_string()])
                .unwrap();

        let partition_file_path = partition_file.into_temp_path();
        let partitions = bim
            .get_fileline_partitions_from_partition_file(
                partition_file_path.to_str().unwrap(),
            )
            .unwrap();
        let mut iter = partitions.iter();
        assert_eq!(
            iter.next(),
            Some((
                "p1",
                &OrderedIntegerSet::from_slice(&[[1, 2], [5, 5], [10, 10], [
                    12, 15
                ]])
            ))
        );
        assert_eq!(
            iter.next(),
            Some((
                "p2",
                &OrderedIntegerSet::from_slice(&[[3, 4], [6, 6], [8, 8], [
                    17, 17
                ]])
            ))
        );
        assert_eq!(
            iter.next(),
            Some((
                "p3",
                &OrderedIntegerSet::from_slice(&[[0, 0], [7, 7], [11, 11], [
                    16, 16
                ]])
            ))
        );
        assert_eq!(
            iter.next(),
            Some(("p4", &OrderedIntegerSet::from_slice(&[[9, 9]])))
        );
        assert_eq!(iter.next(), None);
    }
}