deepbiop_utils/
io.rs

1mod json;
2mod parquet;
3
4use std::path::Path;
5
6use anyhow::Result;
7pub use json::*;
8pub use parquet::*;
9use std::fs::File;
10
11use flate2::read::GzDecoder;
12use noodles::bgzf;
13use pyo3::prelude::*;
14use pyo3_stub_gen::derive::*;
15use std::io;
16use std::io::Read;
17
18/// Represents different types of file compression formats
19///
20/// This enum is used to identify and handle various compression formats commonly used for files.
21/// It can be used in Python through the deepbiop.utils module.
22///
23/// # Variants
24///
25/// * `Uncompress` - Uncompressed/raw file format
26/// * `Gzip` - Standard gzip compression (.gz files)
27/// * `Bgzip` - Blocked gzip format, commonly used in bioinformatics
28/// * `Zip` - ZIP archive format
29/// * `Bzip2` - bzip2 compression format
30/// * `Xz` - XZ compression format (LZMA2)
31/// * `Zstd` - Zstandard compression format
32/// * `Unknown` - Unknown or unrecognized compression format
33#[gen_stub_pyclass_enum]
34#[pyclass(eq, eq_int, module = "deepbiop.utils")]
35#[derive(Debug, PartialEq, Clone, Eq, Hash)]
36pub enum CompressedType {
37    Uncompress,
38    Gzip,
39    Bgzip,
40    Zip,
41    Bzip2,
42    Xz,
43    Zstd,
44    Unknown,
45}
46
47/// Determines the compression type of a file by examining its header/signature
48///
49/// This function reads the first few bytes of a file and checks for known magic numbers
50/// or file signatures to identify the compression format used.
51///
52/// # Arguments
53///
54/// * `file_path` - Path to the file to check, can be any type that converts to a Path
55///
56/// # Returns
57///
58/// * `Result<CompressedType>` - The detected compression type wrapped in a Result
59///
60/// # Errors
61///
62/// Returns an error if:
63/// * The file cannot be opened
64/// * There are issues reading the file header
65///
66/// # Examples
67///
68/// ```no_run
69/// use deepbiop_utils::io::check_compressed_type;
70/// use std::path::Path;
71///
72/// let file_path = Path::new("test.gz");
73/// let compression = check_compressed_type(file_path).unwrap();
74/// ```
75pub fn check_compressed_type<P: AsRef<Path>>(file_path: P) -> Result<CompressedType> {
76    let mut file = File::open(file_path)?;
77    let mut buffer = [0u8; 18]; // Large enough for BGZF detection
78
79    // Read the first few bytes
80    let bytes_read = file.read(&mut buffer)?;
81    if bytes_read < 2 {
82        return Ok(CompressedType::Uncompress);
83    }
84
85    // Check magic numbers/file signatures
86    match &buffer[..] {
87        // Check for BGZF first (starts with gzip magic number + specific extra fields)
88        [0x1f, 0x8b, 0x08, 0x04, ..] if bytes_read >= 18 => {
89            // Check for BGZF extra field
90            let xlen = u16::from_le_bytes([buffer[10], buffer[11]]) as usize;
91            if xlen >= 6 && buffer[12] == 0x42  // B
92                && buffer[13] == 0x43  // C
93                && buffer[14] == 0x02  // Length of subfield (2)
94                && buffer[15] == 0x00
95            // Length of subfield (2)
96            {
97                Ok(CompressedType::Bgzip)
98            } else {
99                Ok(CompressedType::Gzip)
100            }
101        }
102
103        // Regular Gzip: starts with 0x1F 0x8B
104        [0x1f, 0x8b, ..] => Ok(CompressedType::Gzip),
105
106        // Zip: starts with "PK\x03\x04" or "PK\x05\x06" (empty archive) or "PK\x07\x08" (spanned archive)
107        [0x50, 0x4b, 0x03, 0x04, ..]
108        | [0x50, 0x4b, 0x05, 0x06, ..]
109        | [0x50, 0x4b, 0x07, 0x08, ..] => Ok(CompressedType::Zip),
110
111        // Bzip2: starts with "BZh"
112        [0x42, 0x5a, 0x68, ..] => Ok(CompressedType::Bzip2),
113
114        // XZ: starts with 0xFD "7zXZ"
115        [0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, ..] => Ok(CompressedType::Xz),
116
117        // Zstandard: starts with magic number 0xFD2FB528
118        [0x28, 0xb5, 0x2f, 0xfd, ..] => Ok(CompressedType::Zstd),
119
120        // If no compression signature is found, assume it's a normal file
121        _ => {
122            // Additional check for text/binary content could be added here
123            Ok(CompressedType::Uncompress)
124        }
125    }
126}
127
128/// Checks if a file is compressed by examining its file signature/magic numbers
129///
130/// # Arguments
131/// * `file_path` - Path to the file to check
132///
133/// # Returns
134/// * `Ok(true)` if the file is compressed (gzip, bgzip, zip, bzip2, xz, zstd)
135/// * `Ok(false)` if the file is uncompressed or compression type is unknown
136/// * `Err` if there was an error reading the file
137///
138/// # Example
139/// ```no_run
140/// use deepbiop_utils::io;
141///
142/// let is_compressed = io::is_compressed("file.gz").unwrap();
143/// assert!(is_compressed);
144/// ```
145pub fn is_compressed<P: AsRef<Path>>(file_path: P) -> Result<bool> {
146    match check_compressed_type(file_path)? {
147        CompressedType::Uncompress => Ok(false),
148        CompressedType::Unknown => Ok(false),
149        _ => Ok(true),
150    }
151}
152
153/// Creates a reader for a file that may be compressed
154///
155/// This function detects the compression type of the file and returns an appropriate reader.
156/// Currently supports uncompressed files, gzip, and bgzip formats.
157///
158/// # Arguments
159/// * `file_path` - Path to the file to read, can be compressed or uncompressed
160///
161/// # Returns
162/// * `Ok(Box<dyn io::Read>)` - A boxed reader appropriate for the file's compression
163/// * `Err` - If the file cannot be opened or has an unsupported compression type
164pub fn create_reader_for_compressed_file<P: AsRef<Path>>(
165    file_path: P,
166) -> Result<Box<dyn io::Read>> {
167    let compressed_type = check_compressed_type(file_path.as_ref())?;
168    let file = File::open(file_path)?;
169
170    Ok(match compressed_type {
171        CompressedType::Uncompress => Box::new(file),
172        CompressedType::Gzip => Box::new(GzDecoder::new(file)),
173        CompressedType::Bgzip => Box::new(bgzf::io::Reader::new(file)),
174        _ => return Err(anyhow::anyhow!("unsupported compression type")),
175    })
176}
177
178/// Represents different types of sequence file formats
179#[gen_stub_pyclass_enum]
180#[pyclass(eq, eq_int, module = "deepbiop.utils")]
181#[derive(Debug, PartialEq, Eq, Clone, Hash)]
182pub enum SequenceFileType {
183    Fasta,
184    Fastq,
185    Unknown,
186}
187
188/// Determines if a file is FASTA or FASTQ format by checking its first character
189///
190/// # Arguments
191/// * `file_path` - Path to the sequence file (can be compressed or uncompressed)
192///
193/// # Returns
194/// * `Ok(SequenceFileType)` - The detected sequence file type
195/// * `Err` - If there was an error reading the file
196///
197/// # Example
198/// ```no_run
199/// use deepbiop_utils::io;
200///
201/// let file_type = io::check_sequence_file_type("sample.fq").unwrap();
202/// ```
203pub fn check_sequence_file_type<P: AsRef<Path>>(file_path: P) -> Result<SequenceFileType> {
204    let mut reader = create_reader_for_compressed_file(file_path)?;
205    let mut buffer = [0u8; 1];
206
207    // Read the first byte
208    match reader.read_exact(&mut buffer) {
209        Ok(_) => match buffer[0] as char {
210            '>' => Ok(SequenceFileType::Fasta),
211            '@' => Ok(SequenceFileType::Fastq),
212            _ => Ok(SequenceFileType::Unknown),
213        },
214        Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => Ok(SequenceFileType::Unknown),
215        Err(e) => Err(e.into()),
216    }
217}
218
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file whose content is exactly `bytes`.
    fn temp_file_with(bytes: &[u8]) -> Result<NamedTempFile> {
        let mut file = NamedTempFile::new()?;
        file.write_all(bytes)?;
        Ok(file)
    }

    /// Writes `bytes` to a temporary file and checks the detected compression type.
    fn assert_detected(bytes: &[u8], expected: CompressedType) -> Result<()> {
        let file = temp_file_with(bytes)?;
        assert_eq!(check_compressed_type(file.path())?, expected);
        Ok(())
    }

    #[test]
    fn test_check_file_type() -> Result<()> {
        // Gzip magic followed by an extra field that holds the BGZF "BC" subfield.
        let bgzip_header = [
            0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x42, 0x43,
            0x02, 0x00, 0x00, 0x00,
        ];

        assert_detected(&[0x1f, 0x8b], CompressedType::Gzip)?;
        assert_detected(&bgzip_header, CompressedType::Bgzip)?;
        assert_detected(&[0x50, 0x4b, 0x03, 0x04], CompressedType::Zip)?;
        assert_detected(&[0x42, 0x5a, 0x68], CompressedType::Bzip2)?;
        assert_detected(&[0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00], CompressedType::Xz)?;
        assert_detected(&[0x28, 0xb5, 0x2f, 0xfd], CompressedType::Zstd)?;
        assert_detected(b"Hello world", CompressedType::Uncompress)?;

        Ok(())
    }

    #[test]
    fn test_is_compressed() -> Result<()> {
        // A bare gzip magic number is enough to count as compressed.
        let compressed = temp_file_with(&[0x1f, 0x8b])?;
        assert!(is_compressed(compressed.path())?);

        // Plain text carries no signature.
        let plain = temp_file_with(b"Hello world")?;
        assert!(!is_compressed(plain.path())?);

        Ok(())
    }

    #[test]
    fn test_real_example() -> Result<()> {
        let expectations = [
            ("./tests/data/test.fastq.gz", CompressedType::Gzip),
            ("./tests/data/test.fastqbgz.gz", CompressedType::Bgzip),
            ("./tests/data/test.fastq", CompressedType::Uncompress),
        ];

        for (path, expected) in expectations {
            assert_eq!(check_compressed_type(path)?, expected);
        }
        Ok(())
    }

    #[test]
    fn test_sequence_file_type() -> Result<()> {
        let expectations = [
            ("./tests/data/test.fastq", SequenceFileType::Fastq),
            ("./tests/data/test.fa.gz", SequenceFileType::Fasta),
            ("./tests/data/test.fastq.gz", SequenceFileType::Fastq),
        ];

        for (path, expected) in expectations {
            assert_eq!(check_sequence_file_type(path)?, expected);
        }
        Ok(())
    }
}