1mod json;
2mod parquet;
3
4use std::path::Path;
5
6use anyhow::Result;
7pub use json::*;
8pub use parquet::*;
9use std::fs::File;
10
11use flate2::read::GzDecoder;
12use noodles::bgzf;
13use pyo3::prelude::*;
14use pyo3_stub_gen::derive::*;
15use std::io;
16use std::io::Read;
17
18#[gen_stub_pyclass_enum]
34#[pyclass(eq, eq_int, module = "deepbiop.utils")]
35#[derive(Debug, PartialEq, Clone, Eq, Hash)]
36pub enum CompressedType {
37 Uncompress,
38 Gzip,
39 Bgzip,
40 Zip,
41 Bzip2,
42 Xz,
43 Zstd,
44 Unknown,
45}
46
47pub fn check_compressed_type<P: AsRef<Path>>(file_path: P) -> Result<CompressedType> {
76 let mut file = File::open(file_path)?;
77 let mut buffer = [0u8; 18]; let bytes_read = file.read(&mut buffer)?;
81 if bytes_read < 2 {
82 return Ok(CompressedType::Uncompress);
83 }
84
85 match &buffer[..] {
87 [0x1f, 0x8b, 0x08, 0x04, ..] if bytes_read >= 18 => {
89 let xlen = u16::from_le_bytes([buffer[10], buffer[11]]) as usize;
91 if xlen >= 6 && buffer[12] == 0x42 && buffer[13] == 0x43 && buffer[14] == 0x02 && buffer[15] == 0x00
95 {
97 Ok(CompressedType::Bgzip)
98 } else {
99 Ok(CompressedType::Gzip)
100 }
101 }
102
103 [0x1f, 0x8b, ..] => Ok(CompressedType::Gzip),
105
106 [0x50, 0x4b, 0x03, 0x04, ..]
108 | [0x50, 0x4b, 0x05, 0x06, ..]
109 | [0x50, 0x4b, 0x07, 0x08, ..] => Ok(CompressedType::Zip),
110
111 [0x42, 0x5a, 0x68, ..] => Ok(CompressedType::Bzip2),
113
114 [0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, ..] => Ok(CompressedType::Xz),
116
117 [0x28, 0xb5, 0x2f, 0xfd, ..] => Ok(CompressedType::Zstd),
119
120 _ => {
122 Ok(CompressedType::Uncompress)
124 }
125 }
126}
127
128pub fn is_compressed<P: AsRef<Path>>(file_path: P) -> Result<bool> {
146 match check_compressed_type(file_path)? {
147 CompressedType::Uncompress => Ok(false),
148 CompressedType::Unknown => Ok(false),
149 _ => Ok(true),
150 }
151}
152
153pub fn create_reader_for_compressed_file<P: AsRef<Path>>(
165 file_path: P,
166) -> Result<Box<dyn io::Read>> {
167 let compressed_type = check_compressed_type(file_path.as_ref())?;
168 let file = File::open(file_path)?;
169
170 Ok(match compressed_type {
171 CompressedType::Uncompress => Box::new(file),
172 CompressedType::Gzip => Box::new(GzDecoder::new(file)),
173 CompressedType::Bgzip => Box::new(bgzf::io::Reader::new(file)),
174 _ => return Err(anyhow::anyhow!("unsupported compression type")),
175 })
176}
177
178#[gen_stub_pyclass_enum]
180#[pyclass(eq, eq_int, module = "deepbiop.utils")]
181#[derive(Debug, PartialEq, Eq, Clone, Hash)]
182pub enum SequenceFileType {
183 Fasta,
184 Fastq,
185 Unknown,
186}
187
188pub fn check_sequence_file_type<P: AsRef<Path>>(file_path: P) -> Result<SequenceFileType> {
204 let mut reader = create_reader_for_compressed_file(file_path)?;
205 let mut buffer = [0u8; 1];
206
207 match reader.read_exact(&mut buffer) {
209 Ok(_) => match buffer[0] as char {
210 '>' => Ok(SequenceFileType::Fasta),
211 '@' => Ok(SequenceFileType::Fastq),
212 _ => Ok(SequenceFileType::Unknown),
213 },
214 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => Ok(SequenceFileType::Unknown),
215 Err(e) => Err(e.into()),
216 }
217}
218
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file whose content is exactly `bytes`.
    fn temp_file_with(bytes: &[u8]) -> Result<NamedTempFile> {
        let mut file = NamedTempFile::new()?;
        file.write_all(bytes)?;
        Ok(file)
    }

    /// Runs `check_compressed_type` on a temporary file containing `bytes`.
    fn detect(bytes: &[u8]) -> Result<CompressedType> {
        let file = temp_file_with(bytes)?;
        check_compressed_type(file.path())
    }

    #[test]
    fn test_check_file_type() -> Result<()> {
        // Minimal BGZF header: gzip magic, FEXTRA flag, XLEN = 6, `BC` subfield.
        const BGZIP_HEADER: [u8; 18] = [
            0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x42, 0x43,
            0x02, 0x00, 0x00, 0x00,
        ];

        assert_eq!(detect(&[0x1f, 0x8b])?, CompressedType::Gzip);
        assert_eq!(detect(&BGZIP_HEADER)?, CompressedType::Bgzip);
        assert_eq!(detect(&[0x50, 0x4b, 0x03, 0x04])?, CompressedType::Zip);
        assert_eq!(detect(&[0x42, 0x5a, 0x68])?, CompressedType::Bzip2);
        assert_eq!(
            detect(&[0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00])?,
            CompressedType::Xz
        );
        assert_eq!(detect(&[0x28, 0xb5, 0x2f, 0xfd])?, CompressedType::Zstd);
        assert_eq!(detect(b"Hello world")?, CompressedType::Uncompress);

        Ok(())
    }

    #[test]
    fn test_is_compressed() -> Result<()> {
        let gzip_file = temp_file_with(&[0x1f, 0x8b])?;
        assert!(is_compressed(gzip_file.path())?);

        let normal_file = temp_file_with(b"Hello world")?;
        assert!(!is_compressed(normal_file.path())?);

        Ok(())
    }

    #[test]
    fn test_real_example() -> Result<()> {
        let cases = [
            ("./tests/data/test.fastq.gz", CompressedType::Gzip),
            ("./tests/data/test.fastqbgz.gz", CompressedType::Bgzip),
            ("./tests/data/test.fastq", CompressedType::Uncompress),
        ];

        for (path, expected) in cases {
            assert_eq!(check_compressed_type(path)?, expected);
        }
        Ok(())
    }

    #[test]
    fn test_sequence_file_type() -> Result<()> {
        let cases = [
            ("./tests/data/test.fastq", SequenceFileType::Fastq),
            ("./tests/data/test.fa.gz", SequenceFileType::Fasta),
            ("./tests/data/test.fastq.gz", SequenceFileType::Fastq),
        ];

        for (path, expected) in cases {
            assert_eq!(check_sequence_file_type(path)?, expected);
        }
        Ok(())
    }
}