fastqc_rust/sequence/
fast5.rs1use std::io;
5use std::path::{Path, PathBuf};
6
7use hdf5_pure::File as Hdf5File;
8
9use crate::sequence::{Sequence, SequenceFile};
10
11const FASTQ_PATHS: &[&str] = &[
14 "Analyses/Basecall_2D_000/BaseCalled_template/Fastq",
15 "Analyses/Basecall_2D_000/BaseCalled_2D/Fastq",
16 "Analyses/Basecall_1D_000/BaseCalled_template/Fastq",
17 "Analyses/Basecall_1D_000/BaseCalled_1D/Fastq",
18];
19
20pub struct Fast5File {
21 sequences: Vec<Sequence>,
25 current_index: usize,
27 name: String,
29 #[allow(dead_code)]
30 file_path: PathBuf,
31}
32
33impl Fast5File {
34 pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Self> {
39 let path = path.as_ref();
40 let name = path
41 .file_name()
42 .map(|n| n.to_string_lossy().into_owned())
43 .unwrap_or_else(|| path.to_string_lossy().into_owned());
44
45 let hdf5 = Hdf5File::open(path).map_err(|e| {
46 io::Error::new(
47 io::ErrorKind::InvalidData,
48 format!("Failed to open HDF5 file {}: {}", path.display(), e),
49 )
50 })?;
51
52 let root = hdf5.root();
55 let top_groups = root.groups().map_err(|e| {
56 io::Error::new(
57 io::ErrorKind::InvalidData,
58 format!("Failed to list groups in {}: {}", path.display(), e),
59 )
60 })?;
61
62 let read_prefixes: Vec<String> = top_groups
63 .into_iter()
64 .filter(|name| name.starts_with("read_"))
65 .map(|name| format!("{}/", name))
66 .collect();
67
68 let prefixes = if read_prefixes.is_empty() {
70 vec![String::new()]
71 } else {
72 read_prefixes
73 };
74
75 let mut sequences = Vec::new();
76
77 for prefix in &prefixes {
78 match Self::read_fastq_from_prefix(&hdf5, prefix) {
79 Ok(seq) => sequences.push(seq),
80 Err(e) => {
81 eprintln!(
84 "Warning: Could not extract FASTQ from {}{}: {}",
85 path.display(),
86 prefix,
87 e
88 );
89 }
90 }
91 }
92
93 if sequences.is_empty() {
94 return Err(io::Error::new(
95 io::ErrorKind::InvalidData,
96 format!("No valid FASTQ paths found in {}", path.display()),
97 ));
98 }
99
100 Ok(Fast5File {
101 sequences,
102 current_index: 0,
103 name,
104 file_path: path.to_path_buf(),
105 })
106 }
107
108 fn read_fastq_from_prefix(hdf5: &Hdf5File, prefix: &str) -> io::Result<Sequence> {
112 for &fastq_path in FASTQ_PATHS {
113 let full_path = format!("{}{}", prefix, fastq_path);
114
115 if let Ok(dataset) = hdf5.dataset(&full_path) {
116 if let Ok(strings) = dataset.read_string() {
120 if let Some(fastq_str) = strings.first() {
121 return Self::parse_fastq_string(fastq_str);
122 }
123 } else if let Ok(bytes) = dataset.read_u8() {
124 let fastq_str = String::from_utf8(bytes)
126 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
127 return Self::parse_fastq_string(&fastq_str);
128 }
129 }
130 }
131
132 Err(io::Error::new(
133 io::ErrorKind::NotFound,
134 format!("No valid FASTQ data found at prefix '{}'", prefix),
135 ))
136 }
137
138 fn parse_fastq_string(fastq: &str) -> io::Result<Sequence> {
142 let lines: Vec<&str> = fastq.split('\n').collect();
143
144 if lines.len() < 4 {
145 return Err(io::Error::new(
146 io::ErrorKind::InvalidData,
147 format!(
148 "Didn't get 4 sections from FASTQ string (got {})",
149 lines.len()
150 ),
151 ));
152 }
153
154 let id = lines[0].to_string();
155 let seq_bytes = lines[1].as_bytes().to_vec();
156 let qual_bytes = lines[3].as_bytes().to_vec();
157
158 Ok(Sequence::new(id, seq_bytes, qual_bytes))
160 }
161}
162
163impl SequenceFile for Fast5File {
164 fn next(&mut self) -> Option<io::Result<Sequence>> {
165 if self.current_index >= self.sequences.len() {
166 return None;
167 }
168
169 let mut seq = Sequence::new(String::new(), Vec::new(), Vec::new());
172 std::mem::swap(&mut seq, &mut self.sequences[self.current_index]);
173 self.current_index += 1;
174 Some(Ok(seq))
175 }
176
177 fn name(&self) -> &str {
178 &self.name
179 }
180
181 fn is_colorspace(&self) -> bool {
182 false
183 }
184
185 fn percent_complete(&self) -> f64 {
186 if self.current_index >= self.sequences.len() {
187 return 100.0;
188 }
189 (self.current_index as f64 * 100.0) / self.sequences.len() as f64
191 }
192}