Skip to main content

fastqc_rust/sequence/
fast5.rs

1// Fast5 (Nanopore HDF5) file reader
2// Corresponds to Sequence/Fast5File.java
3
4use std::io;
5use std::path::{Path, PathBuf};
6
7use hdf5_pure::File as Hdf5File;
8
9use crate::sequence::{Sequence, SequenceFile};
10
11/// HDF5 paths tried in order to find FASTQ data within each read.
12/// Matches the rdfPaths array in Fast5File.java.
13const FASTQ_PATHS: &[&str] = &[
14    "Analyses/Basecall_2D_000/BaseCalled_template/Fastq",
15    "Analyses/Basecall_2D_000/BaseCalled_2D/Fastq",
16    "Analyses/Basecall_1D_000/BaseCalled_template/Fastq",
17    "Analyses/Basecall_1D_000/BaseCalled_1D/Fastq",
18];
19
20pub struct Fast5File {
21    /// Pre-loaded sequences from the HDF5 file.
22    /// Java reads lazily via readPaths iterator. We pre-load all sequences
23    /// at open time since the file must be fully in memory for hdf5-pure anyway.
24    sequences: Vec<Sequence>,
25    /// Current position in the sequences vector.
26    current_index: usize,
27    /// Display name of the file.
28    name: String,
29    #[allow(dead_code)]
30    file_path: PathBuf,
31}
32
33impl Fast5File {
34    /// Open a Fast5 file and extract all FASTQ sequences from it.
35    ///
36    /// Matches Fast5File constructor. Handles both single-read files
37    /// (FASTQ data at top level) and multi-read files (read_XXX/ subfolders).
38    pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Self> {
39        let path = path.as_ref();
40        let name = path
41            .file_name()
42            .map(|n| n.to_string_lossy().into_owned())
43            .unwrap_or_else(|| path.to_string_lossy().into_owned());
44
45        let hdf5 = Hdf5File::open(path).map_err(|e| {
46            io::Error::new(
47                io::ErrorKind::InvalidData,
48                format!("Failed to open HDF5 file {}: {}", path.display(), e),
49            )
50        })?;
51
52        // Check for multi-read structure (read_XXX/ folders at top level).
53        // If present, each subfolder is a separate read. Otherwise, the top level IS the read.
54        let root = hdf5.root();
55        let top_groups = root.groups().map_err(|e| {
56            io::Error::new(
57                io::ErrorKind::InvalidData,
58                format!("Failed to list groups in {}: {}", path.display(), e),
59            )
60        })?;
61
62        let read_prefixes: Vec<String> = top_groups
63            .into_iter()
64            .filter(|name| name.starts_with("read_"))
65            .map(|name| format!("{}/", name))
66            .collect();
67
68        // If we found read_ folders, use them. Otherwise use "" (top level).
69        let prefixes = if read_prefixes.is_empty() {
70            vec![String::new()]
71        } else {
72            read_prefixes
73        };
74
75        let mut sequences = Vec::new();
76
77        for prefix in &prefixes {
78            match Self::read_fastq_from_prefix(&hdf5, prefix) {
79                Ok(seq) => sequences.push(seq),
80                Err(e) => {
81                    // Java throws SequenceFormatException if no valid paths found.
82                    // We collect what we can and report errors for individual reads.
83                    eprintln!(
84                        "Warning: Could not extract FASTQ from {}{}: {}",
85                        path.display(),
86                        prefix,
87                        e
88                    );
89                }
90            }
91        }
92
93        if sequences.is_empty() {
94            return Err(io::Error::new(
95                io::ErrorKind::InvalidData,
96                format!("No valid FASTQ paths found in {}", path.display()),
97            ));
98        }
99
100        Ok(Fast5File {
101            sequences,
102            current_index: 0,
103            name,
104            file_path: path.to_path_buf(),
105        })
106    }
107
108    /// Try to read a FASTQ record from the given prefix path within the HDF5 file.
109    ///
110    /// Tries each of the rdfPaths in order until one is found.
111    fn read_fastq_from_prefix(hdf5: &Hdf5File, prefix: &str) -> io::Result<Sequence> {
112        for &fastq_path in FASTQ_PATHS {
113            let full_path = format!("{}{}", prefix, fastq_path);
114
115            if let Ok(dataset) = hdf5.dataset(&full_path) {
116                // Try read_string() first (works for fixed-length HDF5 strings).
117                // Fall back to read_u8() for variable-length strings (common in real Fast5 files
118                // written by ONT tools), then decode as UTF-8.
119                if let Ok(strings) = dataset.read_string() {
120                    if let Some(fastq_str) = strings.first() {
121                        return Self::parse_fastq_string(fastq_str);
122                    }
123                } else if let Ok(bytes) = dataset.read_u8() {
124                    // Variable-length string: raw bytes, interpret as UTF-8
125                    let fastq_str = String::from_utf8(bytes)
126                        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
127                    return Self::parse_fastq_string(&fastq_str);
128                }
129            }
130        }
131
132        Err(io::Error::new(
133            io::ErrorKind::NotFound,
134            format!("No valid FASTQ data found at prefix '{}'", prefix),
135        ))
136    }
137
138    /// Parse a 4-line FASTQ string from HDF5 into a Sequence.
139    ///
140    /// Matches the `fastq.split("\\n")` parsing in Fast5File.next().
141    fn parse_fastq_string(fastq: &str) -> io::Result<Sequence> {
142        let lines: Vec<&str> = fastq.split('\n').collect();
143
144        if lines.len() < 4 {
145            return Err(io::Error::new(
146                io::ErrorKind::InvalidData,
147                format!(
148                    "Didn't get 4 sections from FASTQ string (got {})",
149                    lines.len()
150                ),
151            ));
152        }
153
154        let id = lines[0].to_string();
155        let seq_bytes = lines[1].as_bytes().to_vec();
156        let qual_bytes = lines[3].as_bytes().to_vec();
157
158        // Sequence::new handles uppercase conversion
159        Ok(Sequence::new(id, seq_bytes, qual_bytes))
160    }
161}
162
163impl SequenceFile for Fast5File {
164    fn next(&mut self) -> Option<io::Result<Sequence>> {
165        if self.current_index >= self.sequences.len() {
166            return None;
167        }
168
169        // Take ownership via swap with a cheap default, avoiding a full clone
170        // of the id, sequence, and quality vectors for every read.
171        let mut seq = Sequence::new(String::new(), Vec::new(), Vec::new());
172        std::mem::swap(&mut seq, &mut self.sequences[self.current_index]);
173        self.current_index += 1;
174        Some(Ok(seq))
175    }
176
177    fn name(&self) -> &str {
178        &self.name
179    }
180
181    fn is_colorspace(&self) -> bool {
182        false
183    }
184
185    fn percent_complete(&self) -> f64 {
186        if self.current_index >= self.sequences.len() {
187            return 100.0;
188        }
189        // (readPathsIndexPosition * 100) / readPaths.length
190        (self.current_index as f64 * 100.0) / self.sequences.len() as f64
191    }
192}