mod json;
mod parquet;
use std::path::Path;
use anyhow::Result;
pub use json::*;
pub use parquet::*;
use std::fs::File;
use flate2::read::GzDecoder;
use noodles::bgzf;
use pyo3::prelude::*;
use pyo3_stub_gen::derive::*;
use std::io;
use std::io::Read;
/// Compression format of a file, as classified by `check_compressed_type`
/// from the file's leading bytes.
// NOTE(review): `eq_int` presumably lets Python compare variants against their
// integer discriminants, so reordering variants may be observable — confirm.
#[gen_stub_pyclass_enum]
#[pyclass(eq, eq_int, module = "deepbiop.utils")]
#[derive(Debug, PartialEq, Clone, Eq, Hash)]
pub enum CompressedType {
/// No known signature matched, or fewer than two bytes could be read.
Uncompress,
/// Header starts with `1f 8b` and is not classified as `Bgzip`.
Gzip,
/// Header starts with `1f 8b 08 04`, bytes 12..16 are `42 43 02 00`, and the
/// little-endian `u16` at bytes 10..12 is at least 6.
Bgzip,
/// Header starts with `50 4b` followed by `03 04`, `05 06`, or `07 08`.
Zip,
/// Header starts with `42 5a 68`.
Bzip2,
/// Header starts with `fd 37 7a 58 5a 00`.
Xz,
/// Header starts with `28 b5 2f fd`.
Zstd,
/// Not returned by `check_compressed_type` in this file; `is_compressed`
/// reports it as not compressed.
Unknown,
}
/// Detects the compression format of a file from its leading magic bytes.
///
/// At most the first 18 bytes of the file are read. Signatures are matched
/// only against bytes that were actually read, so a file that is shorter than
/// a signature can never be classified as that format.
///
/// Returns [`CompressedType::Uncompress`] when no known signature matches
/// (including empty and one-byte files). A gzip member is reported as
/// [`CompressedType::Bgzip`] only when the full 18-byte header is present,
/// the header starts with `1f 8b 08 04` (deflate with the extra-field flag),
/// the extra-field length is at least 6, and the extra field begins with the
/// `BC` subfield of length 2; every other `1f 8b` header is
/// [`CompressedType::Gzip`].
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read.
pub fn check_compressed_type<P: AsRef<Path>>(file_path: P) -> Result<CompressedType> {
    // Number of leading bytes needed to recognise a BGZF block header; every
    // other signature checked below is shorter.
    const HEADER_LEN: usize = 18;

    // `take` + `read_to_end` keeps reading until `HEADER_LEN` bytes or EOF, so a
    // short or interrupted read cannot truncate the header the way a single
    // `read` call could.
    let mut header = Vec::with_capacity(HEADER_LEN);
    File::open(file_path)?
        .take(HEADER_LEN as u64)
        .read_to_end(&mut header)?;

    // Matching on the slice of bytes actually read (rather than a zero-padded
    // fixed buffer) prevents padding from completing a signature, e.g. the
    // trailing `0x00` of the XZ magic.
    let detected = match header.as_slice() {
        // Offsets 4..10 are not inspected, 10..12 hold the little-endian
        // extra-field length, 12..16 the `BC` subfield id and its length (2).
        [0x1f, 0x8b, 0x08, 0x04, _, _, _, _, _, _, xlen_lo, xlen_hi, 0x42, 0x43, 0x02, 0x00, _, _, ..]
            if u16::from_le_bytes([*xlen_lo, *xlen_hi]) >= 6 =>
        {
            CompressedType::Bgzip
        }
        [0x1f, 0x8b, ..] => CompressedType::Gzip,
        [0x50, 0x4b, 0x03, 0x04, ..]
        | [0x50, 0x4b, 0x05, 0x06, ..]
        | [0x50, 0x4b, 0x07, 0x08, ..] => CompressedType::Zip,
        [0x42, 0x5a, 0x68, ..] => CompressedType::Bzip2,
        [0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, ..] => CompressedType::Xz,
        [0x28, 0xb5, 0x2f, 0xfd, ..] => CompressedType::Zstd,
        _ => CompressedType::Uncompress,
    };
    Ok(detected)
}
/// Reports whether the file at `file_path` carries a recognised compression
/// signature.
///
/// Returns `Ok(false)` when `check_compressed_type` yields
/// `CompressedType::Uncompress` or `CompressedType::Unknown`, and `Ok(true)`
/// for every other variant.
///
/// # Errors
///
/// Propagates any error returned by `check_compressed_type`.
pub fn is_compressed<P: AsRef<Path>>(file_path: P) -> Result<bool> {
    let detected = check_compressed_type(file_path)?;
    let plain = matches!(
        detected,
        CompressedType::Uncompress | CompressedType::Unknown
    );
    Ok(!plain)
}
/// Opens `file_path` and wraps it in a reader that yields decompressed bytes.
///
/// The format is detected with `check_compressed_type`. Uncompressed files are
/// returned as the raw `File`, gzip files are wrapped in `GzDecoder`, and
/// bgzip files in `bgzf::io::Reader`.
///
/// # Errors
///
/// Returns an error if detection or opening the file fails, or if the detected
/// format is anything other than uncompressed, gzip, or bgzip.
pub fn create_reader_for_compressed_file<P: AsRef<Path>>(
    file_path: P,
) -> Result<Box<dyn io::Read>> {
    let path = file_path.as_ref();
    let detected = check_compressed_type(path)?;
    // The file is opened before dispatching, so an open failure is reported
    // ahead of the "unsupported" error.
    let handle = File::open(path)?;

    let reader: Box<dyn io::Read> = match detected {
        CompressedType::Uncompress => Box::new(handle),
        CompressedType::Gzip => Box::new(GzDecoder::new(handle)),
        CompressedType::Bgzip => Box::new(bgzf::io::Reader::new(handle)),
        CompressedType::Zip
        | CompressedType::Bzip2
        | CompressedType::Xz
        | CompressedType::Zstd
        | CompressedType::Unknown => anyhow::bail!("unsupported compression type"),
    };
    Ok(reader)
}
/// Sequence file format, as classified by `check_sequence_file_type` from the
/// first byte produced by the (possibly decompressing) reader.
// NOTE(review): `eq_int` presumably lets Python compare variants against their
// integer discriminants, so reordering variants may be observable — confirm.
#[gen_stub_pyclass_enum]
#[pyclass(eq, eq_int, module = "deepbiop.utils")]
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub enum SequenceFileType {
/// The first byte is `>`.
Fasta,
/// The first byte is `@`.
Fastq,
/// Any other first byte, or the stream ended before one byte could be read.
Unknown,
}
/// Classifies a sequence file by the first byte of its decompressed content.
///
/// The file is opened through `create_reader_for_compressed_file`, so
/// uncompressed, gzip, and bgzip inputs are handled. A leading `>` yields
/// `SequenceFileType::Fasta`, a leading `@` yields `SequenceFileType::Fastq`,
/// and anything else (including an empty stream) yields
/// `SequenceFileType::Unknown`.
///
/// # Errors
///
/// Returns an error if the reader cannot be created, or if reading the first
/// byte fails for any reason other than reaching end of input.
pub fn check_sequence_file_type<P: AsRef<Path>>(file_path: P) -> Result<SequenceFileType> {
    let mut reader = create_reader_for_compressed_file(file_path)?;
    let mut first = [0u8; 1];

    if let Err(err) = reader.read_exact(&mut first) {
        // An empty stream is not a failure; it simply has no recognisable type.
        return if err.kind() == io::ErrorKind::UnexpectedEof {
            Ok(SequenceFileType::Unknown)
        } else {
            Err(err.into())
        };
    }

    let kind = match first[0] {
        b'>' => SequenceFileType::Fasta,
        b'@' => SequenceFileType::Fastq,
        _ => SequenceFileType::Unknown,
    };
    Ok(kind)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file holding exactly `bytes`.
    fn temp_file_with(bytes: &[u8]) -> Result<NamedTempFile> {
        let mut file = NamedTempFile::new()?;
        file.write_all(bytes)?;
        Ok(file)
    }

    /// Writes `bytes` to a temporary file and runs signature detection on it.
    fn detect(bytes: &[u8]) -> Result<CompressedType> {
        let file = temp_file_with(bytes)?;
        check_compressed_type(file.path())
    }

    #[test]
    fn test_check_file_type() -> Result<()> {
        // Bare gzip magic.
        assert_eq!(detect(&[0x1f, 0x8b])?, CompressedType::Gzip);

        // Gzip header carrying the `BC` extra subfield.
        let bgzip_header = [
            0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x42, 0x43,
            0x02, 0x00, 0x00, 0x00,
        ];
        assert_eq!(detect(&bgzip_header)?, CompressedType::Bgzip);

        assert_eq!(detect(&[0x50, 0x4b, 0x03, 0x04])?, CompressedType::Zip);
        assert_eq!(detect(&[0x42, 0x5a, 0x68])?, CompressedType::Bzip2);
        assert_eq!(
            detect(&[0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00])?,
            CompressedType::Xz
        );
        assert_eq!(detect(&[0x28, 0xb5, 0x2f, 0xfd])?, CompressedType::Zstd);

        // Plain text has no recognised signature.
        assert_eq!(detect(b"Hello world")?, CompressedType::Uncompress);
        Ok(())
    }

    #[test]
    fn test_is_compressed() -> Result<()> {
        let gzip_file = temp_file_with(&[0x1f, 0x8b])?;
        assert!(is_compressed(gzip_file.path())?);

        let normal_file = temp_file_with(b"Hello world")?;
        assert!(!is_compressed(normal_file.path())?);
        Ok(())
    }

    #[test]
    fn test_real_example() -> Result<()> {
        let cases = [
            ("./tests/data/test.fastq.gz", CompressedType::Gzip),
            ("./tests/data/test.fastqbgz.gz", CompressedType::Bgzip),
            ("./tests/data/test.fastq", CompressedType::Uncompress),
        ];
        for (path, expected) in cases {
            assert_eq!(check_compressed_type(path)?, expected);
        }
        Ok(())
    }

    #[test]
    fn test_sequence_file_type() -> Result<()> {
        let cases = [
            ("./tests/data/test.fastq", SequenceFileType::Fastq),
            ("./tests/data/test.fa.gz", SequenceFileType::Fasta),
            ("./tests/data/test.fastq.gz", SequenceFileType::Fastq),
        ];
        for (path, expected) in cases {
            assert_eq!(check_sequence_file_type(path)?, expected);
        }
        Ok(())
    }
}