d4 0.3.9

The D4 file format implementation
Documentation
use std::{
    io::{Error, ErrorKind, Read, Result, Seek},
    path::{Path, PathBuf},
};

use d4_framefile::{Blob, Directory, OpenResult};
use reqwest::IntoUrl;

use crate::{
    d4file::validate_header,
    index::{D4IndexCollection, DataIndexRef, DataSummary, SecondaryFrameIndex},
    ptab::PRIMARY_TABLE_NAME,
    stab::{CompressionMethod, RecordBlockParsingState, SECONDARY_TABLE_NAME},
    Chrom, Header,
};

use super::{http::HttpReader, table::SecondaryTableRef, view::D4TrackView};

/// Reader for a single D4 track stored in a seekable byte source `R`.
///
/// Instances are built by `from_track_root` / `from_reader` (and the URL based
/// constructors when `R` is `HttpReader`).
pub struct D4TrackReader<R: Read + Seek> {
    /// Header parsed from the track's header stream.
    header: Header,
    /// Blob opened from the track root under `PRIMARY_TABLE_NAME`.
    primary_table: Blob<R>,
    /// Secondary table references produced by `SecondaryTableRef::create_stream_index`.
    secondary_table: Vec<SecondaryTableRef<R>>,
    /// Secondary frame index; `None` when it could not be loaded for this track.
    sfi: Option<SecondaryFrameIndex>,
    /// Directory the track was opened from; exposed through `as_root`.
    track_root: Directory<R>,
}

/// A group of track readers that are queried together for the same region.
pub struct D4MatrixReader<R: Read + Seek> {
    /// One reader per opened track; `get_view` visits them in this order.
    tracks: Vec<D4TrackReader<R>>,
}

impl D4MatrixReader<HttpReader> {
    /// Opens every track of the remote D4 file at `url` that is accepted by `pat`.
    ///
    /// The file is scanned once with `find_tracks` to collect the matching track
    /// paths, then each selected track is opened through its own `HttpReader`.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the file cannot be scanned or if any
    /// of the selected tracks fails to open. No partially opened matrix is
    /// returned in that case.
    pub fn open_tracks<U: IntoUrl + Clone, Pat: FnMut(Option<&Path>) -> bool>(
        url: U,
        pat: Pat,
    ) -> Result<D4MatrixReader<HttpReader>> {
        let mut track_to_open = vec![];
        let reader = HttpReader::new(url.clone())?;
        crate::d4file::find_tracks(reader, pat, &mut track_to_open)?;

        // Collecting into `Result` stops at the first track that fails to open
        // and hands the error back to the caller instead of panicking.
        // Note: `to_str` yields `None` for a non-UTF-8 path, which `from_reader`
        // treats the same as "no track name given".
        let tracks = track_to_open
            .into_iter()
            .map(|path| D4TrackReader::from_url_and_track_name(url.clone(), path.to_str()))
            .collect::<Result<Vec<_>>>()?;

        Ok(Self { tracks })
    }
}
impl<R: Read + Seek> D4MatrixReader<R> {
    /// Appends one view per track, covering `begin..end` on `chrom`, to `buf`.
    ///
    /// Tracks are visited in the order they are stored. If creating a view for
    /// some track fails, that error is returned immediately; views that were
    /// already pushed for earlier tracks remain in `buf`.
    pub fn get_view(
        &mut self,
        chrom: &str,
        begin: u32,
        end: u32,
        buf: &mut Vec<D4TrackView<R>>,
    ) -> Result<()> {
        for track in &mut self.tracks {
            buf.push(track.get_view(chrom, begin, end)?);
        }
        Ok(())
    }
}

impl<R: Read + Seek> D4TrackReader<R> {
    /// Borrows the directory this track was opened from.
    pub fn as_root(&self) -> &Directory<R> {
        let Self { track_root, .. } = self;
        track_root
    }
}

#[cfg(feature = "http_reader")]
impl D4TrackReader<HttpReader> {
    /// Opens the track `track_name` of the remote D4 file at `url`.
    ///
    /// With `track_name == None` the track selection is left to `from_reader`.
    pub fn from_url_and_track_name<U: IntoUrl>(url: U, track_name: Option<&str>) -> Result<Self> {
        let reader = HttpReader::new(url)?;
        Self::from_reader(reader, track_name)
    }

    /// Opens a track of a remote D4 file, taking the track name from the URL.
    ///
    /// Everything after the last `#` in `url` is used as the track name; when
    /// the URL contains no `#`, no track name is passed to `from_reader`.
    pub fn from_url<U: IntoUrl>(url: U) -> Result<Self> {
        // Copy the name out so the borrow of `url` ends before `url` is moved
        // into the reader. The slice starts one byte past the `#` (a single-byte
        // character), so the separator itself is not part of the track name.
        let track_name = {
            let url_str = url.as_str();
            url_str
                .rfind('#')
                .map(|pos| url_str[pos + 1..].to_string())
        };
        let reader = HttpReader::new(url)?;
        Self::from_reader(reader, track_name.as_deref())
    }

    /// Lists the paths of all tracks found in the remote D4 file at `url`.
    pub fn list_tracks<U: IntoUrl>(url: U) -> Result<Vec<PathBuf>> {
        let reader = HttpReader::new(url)?;
        let mut ret = vec![];
        // The predicate accepts every candidate, so nothing is filtered out.
        crate::d4file::find_tracks(reader, |_| true, &mut ret)?;
        Ok(ret)
    }
}

impl<R: Read + Seek> D4TrackReader<R> {
    /// Returns the header of this track.
    pub fn get_header(&self) -> &Header {
        &self.header
    }

    /// Returns the denominator stored in the header, or `None` when the header
    /// reports the track as integral.
    pub fn get_denominator(&self) -> Option<f64> {
        if self.header.is_integral() {
            None
        } else {
            Some(self.header.get_denominator())
        }
    }

    /// Returns the chromosome list recorded in the header.
    pub fn chrom_list(&self) -> &[Chrom] {
        self.header.chrom_list()
    }

    /// Builds a view over the region `begin..end` of chromosome `chrom`.
    ///
    /// The view combines the chromosome's slice of the primary table with the
    /// secondary tables that overlap the requested range.
    ///
    /// # Errors
    ///
    /// Returns an error with the message `"chrom name not found"` when `chrom`
    /// is unknown to the header, and forwards any error raised while looking up
    /// a partial secondary table in the secondary frame index.
    pub fn get_view(&mut self, chrom: &str, begin: u32, end: u32) -> Result<D4TrackView<R>> {
        let primary_offset = self.header.primary_table_offset_of_chrom(chrom);
        let primary_size = self.header.primary_table_size_of_chrom(chrom);
        if primary_size == 0 && self.header.dictionary().bit_width() != 0 {
            return Err(Error::new(ErrorKind::Other, "chrom name not found"));
        }

        let primary_view = self
            .primary_table
            .get_view(primary_offset as u64, primary_size);

        // The size check above is skipped when the dictionary bit width is 0,
        // so an unknown chromosome can still reach this point; report it as an
        // error rather than panicking.
        let chrom_id = self
            .header
            .get_chrom_id(chrom)
            .ok_or_else(|| Error::new(ErrorKind::Other, "chrom name not found"))?;

        let mut secondary_view = Vec::new();

        for table_ref in self
            .secondary_table
            .iter()
            .filter(|table_ref| table_ref.chrom_id == chrom_id)
        {
            let overlap_begin = table_ref.begin.max(begin);
            let overlap_end = table_ref.end.min(end);
            if overlap_begin >= overlap_end {
                // This table does not intersect the requested range.
                continue;
            }

            match self.sfi.as_ref() {
                // The range starts inside this table and a frame index is
                // available: ask the index where to start reading. A table for
                // which the index has no entry is left out of the view.
                Some(sfi) if overlap_begin != table_ref.begin => {
                    if let Some(addr) = sfi.find_partial_seconary_table(chrom, overlap_begin)? {
                        secondary_view.push(SecondaryTableRef::new_partial(
                            table_ref.root.clone(),
                            chrom_id,
                            overlap_begin,
                            overlap_end,
                            addr,
                            table_ref.comp,
                        ));
                    }
                }
                // Either the range covers the table from its start or there is
                // no frame index: use the whole table.
                _ => secondary_view.push(table_ref.clone()),
            }
        }

        Ok(D4TrackView {
            fetch_size: 65536.min(primary_view.size()),
            primary_table: primary_view,
            secondary_tables: secondary_view.into(),
            chrom: chrom.to_string(),
            end,
            cursor: begin,
            primary_table_buffer: None,
            dictionary: self.header.dictionary().clone(),
            stream: None,
            current_record: None,
            rbp_state: RecordBlockParsingState::new(CompressionMethod::NoCompression),
            frame_decode_result: Default::default(),
        })
    }

    /// Builds a reader from the directory that holds a single track.
    ///
    /// Reads the header stream, opens the primary table blob and indexes the
    /// secondary table directory. The secondary frame index is optional: any
    /// failure while loading it simply leaves the reader without one.
    ///
    /// # Errors
    ///
    /// Returns an error if the header, the primary table or the secondary
    /// table directory cannot be opened or parsed.
    pub fn from_track_root(track_root: Directory<R>) -> Result<Self> {
        let header_stream = track_root.open_stream(Header::HEADER_STREAM_NAME)?;
        let header = Header::read(header_stream)?;
        let primary_table = track_root.open_blob(PRIMARY_TABLE_NAME)?;

        let secondary_table = SecondaryTableRef::create_stream_index(
            track_root.open_directory(SECONDARY_TABLE_NAME)?,
            header.chrom_list(),
        )?;

        // Deliberately best-effort: a missing or unreadable index becomes `None`.
        let sfi = D4IndexCollection::from_root_container(&track_root)
            .and_then(|ic| ic.load_seconary_frame_index())
            .ok();

        Ok(D4TrackReader {
            header,
            primary_table,
            secondary_table,
            sfi,
            track_root,
        })
    }

    /// Loads the data index with summary type `S` from this track's index
    /// collection.
    ///
    /// # Errors
    ///
    /// Returns an error if the index collection or the requested data index
    /// cannot be loaded.
    pub fn load_data_index<S: DataSummary>(&self) -> Result<DataIndexRef<S>> {
        let ic = D4IndexCollection::from_root_container(&self.track_root)?;
        ic.load_data_index::<S>()
    }

    /// Opens a track from a raw reader positioned at the start of a D4 file.
    ///
    /// When `track_name` is given, that path inside the file is opened as the
    /// track root. Otherwise the directory containing the first `.metadata`
    /// object found in the file is used.
    ///
    /// # Errors
    ///
    /// Returns an error if the file header is invalid, if the selected path is
    /// not a directory (`"track root not found"`), if the file contains no
    /// `.metadata` object (`"Empty container"`), or if reading the track fails.
    pub fn from_reader(mut reader: R, track_name: Option<&str>) -> Result<Self> {
        validate_header(&mut reader)?;
        // NOTE(review): 8 is presumably the offset just past the file header
        // validated above - confirm against `Directory::open_root`.
        let file_root = Directory::open_root(reader, 8)?;

        let track_root = if let Some(track_name) = track_name {
            match file_root.open(track_name)? {
                OpenResult::SubDir(root) => root,
                _ => return Err(Error::new(ErrorKind::Other, "track root not found")),
            }
        } else if let Some(mut track_root_path) = file_root.find_first_object(".metadata") {
            // Drop the `.metadata` component to get the directory that holds it.
            track_root_path.pop();
            match file_root.open(&track_root_path)? {
                OpenResult::SubDir(root) => root,
                // The container content is external input, so an unexpected
                // object kind is reported as an error instead of a panic.
                _ => return Err(Error::new(ErrorKind::Other, "track root not found")),
            }
        } else {
            return Err(Error::new(ErrorKind::Other, "Empty container"));
        };

        Self::from_track_root(track_root)
    }
}