1use super::*;
4use super::readonly::ReadonlyRefgetStore;
5
6use std::collections::HashMap;
7use std::ffi::OsStr;
8
9use indexmap::IndexMap;
10use std::fs::File;
11use std::io::{BufWriter, Write};
12use std::path::Path;
13
14use anyhow::{anyhow, Context, Result};
15use flate2::Compression;
16use flate2::write::GzEncoder;
17use flate2::read::MultiGzDecoder;
18use gtars_core::utils::get_file_info;
19
20use crate::digest::{
21 SequenceMetadata, SequenceRecord,
22 decode_substring_from_bytes, lookup_alphabet,
23};
24use crate::hashkeyable::{DigestKey, HashKeyable};
25
26
27pub(crate) fn write_fasta_record(
36 writer: &mut dyn Write,
37 metadata: &SequenceMetadata,
38 sequence_data: &[u8],
39 mode: StorageMode,
40 line_width: usize,
41) -> Result<()> {
42 let decoded_sequence = match mode {
43 StorageMode::Encoded => {
44 let alphabet = lookup_alphabet(&metadata.alphabet);
45 let decoded =
46 decode_substring_from_bytes(sequence_data, 0, metadata.length, alphabet);
47 String::from_utf8(decoded).context("Failed to decode sequence as UTF-8")?
48 }
49 StorageMode::Raw => String::from_utf8(sequence_data.to_vec())
50 .context("Failed to decode raw sequence as UTF-8")?,
51 };
52
53 let header = match &metadata.description {
54 Some(desc) => format!(">{} {}", metadata.name, desc),
55 None => format!(">{}", metadata.name),
56 };
57 writeln!(writer, "{}", header)?;
58
59 for chunk in decoded_sequence.as_bytes().chunks(line_width) {
60 writer.write_all(chunk)?;
61 writer.write_all(b"\n")?;
62 }
63
64 Ok(())
65}
66
67impl<K> Iterator for SubstringsFromRegions<'_, K>
72where
73 K: AsRef<[u8]>,
74{
75 type Item = Result<RetrievedSequence, anyhow::Error>;
76
77 fn next(&mut self) -> Option<Self::Item> {
78 use gtars_core::utils::parse_bedlike_file;
79
80 let mut line_string = String::new();
81
82 let num_bytes = self.reader.read_line(&mut line_string);
83 match num_bytes {
84 Ok(bytes) => {
85 if bytes == 0 {
86 return None;
87 }
88 }
89 Err(err) => return Some(Err(err.into())),
90 };
91
92 self.line_num += 1;
93
94 let (parsed_chr, parsed_start, parsed_end) = match parse_bedlike_file(line_string.trim()) {
95 Some(coords) => coords,
96 None => {
97 let err_str = format!(
98 "Error reading line {} because it could not be parsed as a BED-like entry: '{}'",
99 self.line_num + 1,
100 line_string
101 );
102 return Some(Err(anyhow!(err_str)));
103 }
104 };
105
106 if parsed_start < 0 || parsed_end < 0 {
107 let err_str = format!(
108 "Error reading line {} due to invalid start or end coordinates: '{}'",
109 self.line_num + 1,
110 line_string
111 );
112 return Some(Err(anyhow!(err_str)));
113 }
114
115 if self.previous_parsed_chr != parsed_chr {
116 self.previous_parsed_chr = parsed_chr.clone();
117
118 let result = match self
119 .store
120 .get_sequence_by_name(&self.collection_digest, &parsed_chr)
121 {
122 Ok(seq_record) => seq_record,
123 Err(e) => {
124 let err_str = format!(
125 "Line {}: sequence '{}' not found in collection '{}': {}",
126 self.line_num + 1,
127 parsed_chr,
128 String::from_utf8_lossy(self.collection_digest.as_ref()),
129 e
130 );
131 return Some(Err(anyhow!(err_str)));
132 }
133 };
134
135 self.current_seq_digest = result.metadata().sha512t24u.clone();
136 }
137
138 let retrieved_substring = match self.store.get_substring(
139 &self.current_seq_digest,
140 parsed_start as usize,
141 parsed_end as usize,
142 ) {
143 Ok(substring) => substring,
144 Err(e) => {
145 let err_str = format!(
146 "Line {}: failed to get substring for digest '{}' from {} to {}: {}",
147 self.line_num + 1,
148 self.current_seq_digest,
149 parsed_start,
150 parsed_end,
151 e
152 );
153 return Some(Err(anyhow!(err_str)));
154 }
155 };
156
157 Some(Ok(RetrievedSequence {
158 sequence: retrieved_substring,
159 chrom_name: parsed_chr,
160 start: parsed_start as u32,
161 end: parsed_end as u32,
162 }))
163 }
164}
165
166impl ReadonlyRefgetStore {
171 pub fn substrings_from_regions<'a, K: AsRef<[u8]>>(
173 &'a self,
174 collection_digest: K,
175 bed_file_path: &str,
176 ) -> Result<SubstringsFromRegions<'a, K>> {
177 let path = Path::new(bed_file_path);
178 let file_info = get_file_info(path);
179 let is_gzipped = file_info.is_gzipped;
180
181 let opened_bed_file = File::open(path)?;
182
183 let reader: Box<dyn std::io::Read> = match is_gzipped {
184 true => Box::new(MultiGzDecoder::new(std::io::BufReader::new(opened_bed_file))),
185 false => Box::new(opened_bed_file),
186 };
187 let reader = std::io::BufReader::new(reader);
188
189 Ok(SubstringsFromRegions {
190 store: self,
191 reader,
192 collection_digest,
193 previous_parsed_chr: String::new(),
194 current_seq_digest: String::new(),
195 line_num: 0,
196 })
197 }
198
199 pub fn export_fasta_from_regions<K: AsRef<[u8]>>(
201 &self,
202 collection_digest: K,
203 bed_file_path: &str,
204 output_file_path: &str,
205 ) -> Result<()> {
206 let output_path_obj = Path::new(output_file_path);
207 if let Some(parent) = output_path_obj.parent() {
208 std::fs::create_dir_all(parent)?;
209 }
210
211 let file = File::create(output_file_path)?;
212
213 let mut writer: Box<dyn Write> = if output_path_obj.extension() == Some(OsStr::new("gz")) {
214 Box::new(GzEncoder::new(file, Compression::default()))
215 } else {
216 Box::new(file)
217 };
218
219 let collection_key = collection_digest.as_ref().to_key();
220
221 let name_to_metadata: HashMap<String, SequenceMetadata> = self
222 .name_lookup
223 .get(&collection_key)
224 .map(|name_map| {
225 name_map
226 .iter()
227 .filter_map(|(name, seq_digest)| {
228 self.sequence_store.get(seq_digest).map(|record| {
229 (name.clone(), record.metadata().clone())
230 })
231 })
232 .collect()
233 })
234 .unwrap_or_default();
235
236 let seq_iter = self.substrings_from_regions(&collection_digest, bed_file_path)?;
237
238 let mut previous_parsed_chr = String::new();
239 let mut current_header: String = String::new();
240 let mut previous_header: String = String::new();
241
242 for rs in seq_iter.into_iter() {
243 let rs = rs?;
244
245 if previous_parsed_chr != rs.chrom_name {
246 previous_parsed_chr = rs.chrom_name.clone();
247
248 if let Some(meta) = name_to_metadata.get(&rs.chrom_name) {
249 current_header =
250 format!(">{} {} {} {} {}", meta.name, meta.length, meta.alphabet, meta.sha512t24u, meta.md5);
251 }
252 }
253
254 let retrieved_substring = rs.sequence;
255
256 if previous_header != current_header {
257 let prefix = if previous_header.is_empty() { "" } else { "\n" };
258
259 previous_header = current_header.clone();
260
261 let header_to_be_written = format!("{}{}\n", prefix, current_header);
262 writer.write_all(header_to_be_written.as_bytes())?;
263 }
264
265 writer.write_all(retrieved_substring.as_ref())?;
266 }
267
268 writer.flush()?;
269
270 Ok(())
271 }
272
273 pub fn export_fasta<K: AsRef<[u8]>, P: AsRef<Path>>(
275 &self,
276 collection_digest: K,
277 output_path: P,
278 sequence_names: Option<Vec<&str>>,
279 line_width: Option<usize>,
280 ) -> Result<()> {
281 let line_width = line_width.unwrap_or(80);
282 let output_path = output_path.as_ref();
283 let collection_key = collection_digest.as_ref().to_key();
284
285 let name_to_digest: IndexMap<String, DigestKey> = self
286 .name_lookup
287 .get(&collection_key)
288 .ok_or_else(|| {
289 anyhow!(
290 "Collection not found: {:?}",
291 String::from_utf8_lossy(collection_digest.as_ref())
292 )
293 })?
294 .clone();
295
296 let names_to_export: Vec<String> = if let Some(names) = sequence_names {
297 names.iter().map(|s| s.to_string()).collect()
298 } else {
299 name_to_digest.keys().cloned().collect()
300 };
301
302 let file = File::create(output_path).context(format!(
303 "Failed to create output file: {}",
304 output_path.display()
305 ))?;
306
307 let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
308 Box::new(GzEncoder::new(BufWriter::new(file), Compression::default()))
309 } else {
310 Box::new(BufWriter::new(file))
311 };
312
313 for seq_name in names_to_export {
314 let seq_digest = name_to_digest
315 .get(&seq_name)
316 .ok_or_else(|| anyhow!("Sequence '{}' not found in collection", seq_name))?;
317
318 let record = self
319 .sequence_store
320 .get(seq_digest)
321 .ok_or_else(|| anyhow!("Sequence record not found for digest: {:?}", seq_digest))?;
322
323 let (metadata, sequence_data) = match record {
324 SequenceRecord::Stub(_) => {
325 return Err(anyhow!("Sequence data not loaded for '{}'. Call load_sequence() or load_all_sequences() first.", seq_name));
326 }
327 SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
328 };
329
330 write_fasta_record(&mut *writer, metadata, sequence_data, self.mode, line_width)?;
331 }
332
333 writer.flush()?;
334
335 Ok(())
336 }
337
338 pub fn export_fasta_by_digests<P: AsRef<Path>>(
340 &self,
341 seq_digests: Vec<&str>,
342 output_path: P,
343 line_width: Option<usize>,
344 ) -> Result<()> {
345 let line_width = line_width.unwrap_or(80);
346 let output_path = output_path.as_ref();
347
348 let file = File::create(output_path).context(format!(
349 "Failed to create output file: {}",
350 output_path.display()
351 ))?;
352
353 let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
354 Box::new(GzEncoder::new(BufWriter::new(file), Compression::default()))
355 } else {
356 Box::new(BufWriter::new(file))
357 };
358
359 for digest_str in seq_digests {
360 let digest_key = digest_str.as_bytes().to_key();
361
362 let record = self
363 .sequence_store
364 .get(&digest_key)
365 .ok_or_else(|| anyhow!("Sequence record not found for digest: {}", digest_str))?;
366
367 let (metadata, sequence_data) = match record {
368 SequenceRecord::Stub(_) => {
369 return Err(anyhow!(
370 "Sequence data not loaded for digest: {}",
371 digest_str
372 ));
373 }
374 SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
375 };
376
377 write_fasta_record(&mut *writer, metadata, sequence_data, self.mode, line_width)?;
378 }
379
380 writer.flush()?;
381
382 Ok(())
383 }
384}