Skip to main content

gtars_refget/store/
export.rs

1//! FASTA export and BED region extraction for RefgetStore.
2
3use super::*;
4use super::readonly::ReadonlyRefgetStore;
5
6use std::collections::HashMap;
7use std::ffi::OsStr;
8
9use indexmap::IndexMap;
10use std::fs::File;
11use std::io::{BufWriter, Write};
12use std::path::Path;
13
14use anyhow::{anyhow, Context, Result};
15use flate2::Compression;
16use flate2::write::GzEncoder;
17use flate2::read::MultiGzDecoder;
18use gtars_core::utils::get_file_info;
19
20use crate::digest::{
21    SequenceMetadata, SequenceRecord,
22    decode_substring_from_bytes, lookup_alphabet,
23};
24use crate::hashkeyable::{DigestKey, HashKeyable};
25
26
27// ============================================================================
28// Free function
29// ============================================================================
30
31/// Helper function to decode a sequence record and write it as a FASTA entry.
32///
33/// Handles decoding (encoded or raw storage modes), header formatting (with optional
34/// description), and line-wrapped sequence output.
35pub(crate) fn write_fasta_record(
36    writer: &mut dyn Write,
37    metadata: &SequenceMetadata,
38    sequence_data: &[u8],
39    mode: StorageMode,
40    line_width: usize,
41) -> Result<()> {
42    let decoded_sequence = match mode {
43        StorageMode::Encoded => {
44            let alphabet = lookup_alphabet(&metadata.alphabet);
45            let decoded =
46                decode_substring_from_bytes(sequence_data, 0, metadata.length, alphabet);
47            String::from_utf8(decoded).context("Failed to decode sequence as UTF-8")?
48        }
49        StorageMode::Raw => String::from_utf8(sequence_data.to_vec())
50            .context("Failed to decode raw sequence as UTF-8")?,
51    };
52
53    let header = match &metadata.description {
54        Some(desc) => format!(">{} {}", metadata.name, desc),
55        None => format!(">{}", metadata.name),
56    };
57    writeln!(writer, "{}", header)?;
58
59    for chunk in decoded_sequence.as_bytes().chunks(line_width) {
60        writer.write_all(chunk)?;
61        writer.write_all(b"\n")?;
62    }
63
64    Ok(())
65}
66
67// ============================================================================
68// SubstringsFromRegions Iterator impl
69// ============================================================================
70
71impl<K> Iterator for SubstringsFromRegions<'_, K>
72where
73    K: AsRef<[u8]>,
74{
75    type Item = Result<RetrievedSequence, anyhow::Error>;
76
77    fn next(&mut self) -> Option<Self::Item> {
78        use gtars_core::utils::parse_bedlike_file;
79
80        let mut line_string = String::new();
81
82        let num_bytes = self.reader.read_line(&mut line_string);
83        match num_bytes {
84            Ok(bytes) => {
85                if bytes == 0 {
86                    return None;
87                }
88            }
89            Err(err) => return Some(Err(err.into())),
90        };
91
92        self.line_num += 1;
93
94        let (parsed_chr, parsed_start, parsed_end) = match parse_bedlike_file(line_string.trim()) {
95            Some(coords) => coords,
96            None => {
97                let err_str = format!(
98                    "Error reading line {} because it could not be parsed as a BED-like entry: '{}'",
99                    self.line_num + 1,
100                    line_string
101                );
102                return Some(Err(anyhow!(err_str)));
103            }
104        };
105
106        if parsed_start < 0 || parsed_end < 0 {
107            let err_str = format!(
108                "Error reading line {} due to invalid start or end coordinates: '{}'",
109                self.line_num + 1,
110                line_string
111            );
112            return Some(Err(anyhow!(err_str)));
113        }
114
115        if self.previous_parsed_chr != parsed_chr {
116            self.previous_parsed_chr = parsed_chr.clone();
117
118            let result = match self
119                .store
120                .get_sequence_by_name(&self.collection_digest, &parsed_chr)
121            {
122                Ok(seq_record) => seq_record,
123                Err(e) => {
124                    let err_str = format!(
125                        "Line {}: sequence '{}' not found in collection '{}': {}",
126                        self.line_num + 1,
127                        parsed_chr,
128                        String::from_utf8_lossy(self.collection_digest.as_ref()),
129                        e
130                    );
131                    return Some(Err(anyhow!(err_str)));
132                }
133            };
134
135            self.current_seq_digest = result.metadata().sha512t24u.clone();
136        }
137
138        let retrieved_substring = match self.store.get_substring(
139            &self.current_seq_digest,
140            parsed_start as usize,
141            parsed_end as usize,
142        ) {
143            Ok(substring) => substring,
144            Err(e) => {
145                let err_str = format!(
146                    "Line {}: failed to get substring for digest '{}' from {} to {}: {}",
147                    self.line_num + 1,
148                    self.current_seq_digest,
149                    parsed_start,
150                    parsed_end,
151                    e
152                );
153                return Some(Err(anyhow!(err_str)));
154            }
155        };
156
157        Some(Ok(RetrievedSequence {
158            sequence: retrieved_substring,
159            chrom_name: parsed_chr,
160            start: parsed_start as u32,
161            end: parsed_end as u32,
162        }))
163    }
164}
165
166// ============================================================================
167// ReadonlyRefgetStore export methods
168// ============================================================================
169
170impl ReadonlyRefgetStore {
171    /// Get an iterator over substrings defined by BED file regions.
172    pub fn substrings_from_regions<'a, K: AsRef<[u8]>>(
173        &'a self,
174        collection_digest: K,
175        bed_file_path: &str,
176    ) -> Result<SubstringsFromRegions<'a, K>> {
177        let path = Path::new(bed_file_path);
178        let file_info = get_file_info(path);
179        let is_gzipped = file_info.is_gzipped;
180
181        let opened_bed_file = File::open(path)?;
182
183        let reader: Box<dyn std::io::Read> = match is_gzipped {
184            true => Box::new(MultiGzDecoder::new(std::io::BufReader::new(opened_bed_file))),
185            false => Box::new(opened_bed_file),
186        };
187        let reader = std::io::BufReader::new(reader);
188
189        Ok(SubstringsFromRegions {
190            store: self,
191            reader,
192            collection_digest,
193            previous_parsed_chr: String::new(),
194            current_seq_digest: String::new(),
195            line_num: 0,
196        })
197    }
198
199    /// Export sequences from BED file regions to a FASTA file.
200    pub fn export_fasta_from_regions<K: AsRef<[u8]>>(
201        &self,
202        collection_digest: K,
203        bed_file_path: &str,
204        output_file_path: &str,
205    ) -> Result<()> {
206        let output_path_obj = Path::new(output_file_path);
207        if let Some(parent) = output_path_obj.parent() {
208            std::fs::create_dir_all(parent)?;
209        }
210
211        let file = File::create(output_file_path)?;
212
213        let mut writer: Box<dyn Write> = if output_path_obj.extension() == Some(OsStr::new("gz")) {
214            Box::new(GzEncoder::new(file, Compression::default()))
215        } else {
216            Box::new(file)
217        };
218
219        let collection_key = collection_digest.as_ref().to_key();
220
221        let name_to_metadata: HashMap<String, SequenceMetadata> = self
222            .name_lookup
223            .get(&collection_key)
224            .map(|name_map| {
225                name_map
226                    .iter()
227                    .filter_map(|(name, seq_digest)| {
228                        self.sequence_store.get(seq_digest).map(|record| {
229                            (name.clone(), record.metadata().clone())
230                        })
231                    })
232                    .collect()
233            })
234            .unwrap_or_default();
235
236        let seq_iter = self.substrings_from_regions(&collection_digest, bed_file_path)?;
237
238        let mut previous_parsed_chr = String::new();
239        let mut current_header: String = String::new();
240        let mut previous_header: String = String::new();
241
242        for rs in seq_iter.into_iter() {
243            let rs = rs?;
244
245            if previous_parsed_chr != rs.chrom_name {
246                previous_parsed_chr = rs.chrom_name.clone();
247
248                if let Some(meta) = name_to_metadata.get(&rs.chrom_name) {
249                    current_header =
250                        format!(">{} {} {} {} {}", meta.name, meta.length, meta.alphabet, meta.sha512t24u, meta.md5);
251                }
252            }
253
254            let retrieved_substring = rs.sequence;
255
256            if previous_header != current_header {
257                let prefix = if previous_header.is_empty() { "" } else { "\n" };
258
259                previous_header = current_header.clone();
260
261                let header_to_be_written = format!("{}{}\n", prefix, current_header);
262                writer.write_all(header_to_be_written.as_bytes())?;
263            }
264
265            writer.write_all(retrieved_substring.as_ref())?;
266        }
267
268        writer.flush()?;
269
270        Ok(())
271    }
272
273    /// Export sequences from a collection to a FASTA file.
274    pub fn export_fasta<K: AsRef<[u8]>, P: AsRef<Path>>(
275        &self,
276        collection_digest: K,
277        output_path: P,
278        sequence_names: Option<Vec<&str>>,
279        line_width: Option<usize>,
280    ) -> Result<()> {
281        let line_width = line_width.unwrap_or(80);
282        let output_path = output_path.as_ref();
283        let collection_key = collection_digest.as_ref().to_key();
284
285        let name_to_digest: IndexMap<String, DigestKey> = self
286            .name_lookup
287            .get(&collection_key)
288            .ok_or_else(|| {
289                anyhow!(
290                    "Collection not found: {:?}",
291                    String::from_utf8_lossy(collection_digest.as_ref())
292                )
293            })?
294            .clone();
295
296        let names_to_export: Vec<String> = if let Some(names) = sequence_names {
297            names.iter().map(|s| s.to_string()).collect()
298        } else {
299            name_to_digest.keys().cloned().collect()
300        };
301
302        let file = File::create(output_path).context(format!(
303            "Failed to create output file: {}",
304            output_path.display()
305        ))?;
306
307        let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
308            Box::new(GzEncoder::new(BufWriter::new(file), Compression::default()))
309        } else {
310            Box::new(BufWriter::new(file))
311        };
312
313        for seq_name in names_to_export {
314            let seq_digest = name_to_digest
315                .get(&seq_name)
316                .ok_or_else(|| anyhow!("Sequence '{}' not found in collection", seq_name))?;
317
318            let record = self
319                .sequence_store
320                .get(seq_digest)
321                .ok_or_else(|| anyhow!("Sequence record not found for digest: {:?}", seq_digest))?;
322
323            let (metadata, sequence_data) = match record {
324                SequenceRecord::Stub(_) => {
325                    return Err(anyhow!("Sequence data not loaded for '{}'. Call load_sequence() or load_all_sequences() first.", seq_name));
326                }
327                SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
328            };
329
330            write_fasta_record(&mut *writer, metadata, sequence_data, self.mode, line_width)?;
331        }
332
333        writer.flush()?;
334
335        Ok(())
336    }
337
338    /// Export sequences by their sequence digests to a FASTA file.
339    pub fn export_fasta_by_digests<P: AsRef<Path>>(
340        &self,
341        seq_digests: Vec<&str>,
342        output_path: P,
343        line_width: Option<usize>,
344    ) -> Result<()> {
345        let line_width = line_width.unwrap_or(80);
346        let output_path = output_path.as_ref();
347
348        let file = File::create(output_path).context(format!(
349            "Failed to create output file: {}",
350            output_path.display()
351        ))?;
352
353        let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
354            Box::new(GzEncoder::new(BufWriter::new(file), Compression::default()))
355        } else {
356            Box::new(BufWriter::new(file))
357        };
358
359        for digest_str in seq_digests {
360            let digest_key = digest_str.as_bytes().to_key();
361
362            let record = self
363                .sequence_store
364                .get(&digest_key)
365                .ok_or_else(|| anyhow!("Sequence record not found for digest: {}", digest_str))?;
366
367            let (metadata, sequence_data) = match record {
368                SequenceRecord::Stub(_) => {
369                    return Err(anyhow!(
370                        "Sequence data not loaded for digest: {}",
371                        digest_str
372                    ));
373                }
374                SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
375            };
376
377            write_fasta_record(&mut *writer, metadata, sequence_data, self.mode, line_width)?;
378        }
379
380        writer.flush()?;
381
382        Ok(())
383    }
384}