// seq_here/extract.rs

1use std::collections::{HashMap, HashSet};
2use std::fs::File;
3use std::{fs, io};
4use std::io::BufRead;
5use crate::error::{e_exit, e_println};
6use crate::utils::{FileType, MultiFormatWriter};
7use bio::io::gff::GffType;
8use bio::io::{fasta, fastq, gff};
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11use rayon::iter::IntoParallelRefIterator;
12use rayon::iter::ParallelIterator;
13
/// Extracts records matching a set of sequence IDs from FASTA, FASTQ,
/// and GFF3 files, writing matches through a shared multi-format writer.
///
/// Stateless marker type: all functionality is exposed as associated
/// functions (`extract_id`, `extract_id_files`).
pub struct ExtractSegment;
16
17impl ExtractSegment {
18    /// Extract sequences that match a single ID
19    /// 
20    /// # Arguments
21    /// * `paths` - Input sequence files (FASTA, FASTQ, GFF)
22    /// * `id` - Sequence identifier to extract
23    /// * `output` - Output file path
24    pub fn extract_id(paths: Vec<PathBuf>, id: String, output: PathBuf) {
25        let id_set = vec![Self::normalize_id(&id)].into_iter().collect();
26        Self::process_files_parallel(paths, &id_set, &output)
27    }
28
29    /// Extract sequences matching IDs from a file
30    /// 
31    /// # Arguments
32    /// * `paths` - Input sequence files (FASTA, FASTQ, GFF)
33    /// * `id_file` - File containing IDs to extract (one per line)
34    /// * `output` - Output file path
35    pub fn extract_id_files(paths: Vec<PathBuf>, id_file: PathBuf, output: PathBuf) {
36        let id_set = match Self::load_id_set(&id_file) {
37            Ok(set) => set,
38            Err(e) => e_exit("ID-LOAD", &format!("Failed to load IDs: {}", e), 1),
39        };
40        Self::process_files_parallel(paths, &id_set, &output)
41    }
42
43    /// Process multiple files in parallel
44    fn process_files_parallel(paths: Vec<PathBuf>, id_set: &HashSet<String>, output: &PathBuf) {
45        let writer = match MultiFormatWriter::new(output) {
46            Ok(w) => Arc::new(Mutex::new(w)),
47            Err(e) => e_exit("WRITER", &format!("Output init failed: {}", e), 2),
48        };
49
50        paths.par_iter().for_each(|path| {
51            let writer = Arc::clone(&writer);
52            match FileType::infer_file_type(path) {
53                FileType::Fasta => Self::process_file(path, id_set, writer, Self::process_fasta),
54                FileType::Gff => Self::process_file(path, id_set, writer, Self::process_gff),
55                FileType::Fastq => Self::process_file(path, id_set, writer, Self::process_fastq),
56                FileType::Unknown => e_println("TYPE-ERROR", &format!("Unsupported format: {:?}", path)),
57            };
58        });
59    }
60
61    /// Process a single file with the appropriate processor function
62    fn process_file<P>(
63        path: &PathBuf,
64        ids: &HashSet<String>,
65        writer: Arc<Mutex<MultiFormatWriter>>,
66        processor: P,
67    ) where
68        P: Fn(&PathBuf, &HashSet<String>, &mut MultiFormatWriter),
69    {
70        let mut writer = writer.lock().unwrap_or_else(|e| {
71            e_exit("LOCK-ERROR", &format!("Writer lock failed: {}", e), 3)
72        });
73
74        processor(path, ids, &mut writer);
75    }
76
77    /// Load sequence IDs from a file into a HashSet
78    fn load_id_set(path: &PathBuf) -> io::Result<HashSet<String>> {
79        let file = File::open(path).map_err(|e| {
80            e_println("FILE-ERROR", &format!("Open {} failed: {}", path.display(), e));
81            e
82        })?;
83
84        let mut set = HashSet::new();
85        for line in io::BufReader::new(file).lines() {
86            let raw_id = line?.trim().to_string();
87            if !raw_id.is_empty() {
88                set.insert(Self::normalize_id(&raw_id));
89            }
90        }
91        Ok(set)
92    }
93
94    /// Normalize sequence identifiers for consistent matching
95    /// 
96    /// Takes the first part before any whitespace, pipe, or semicolon
97    /// and converts to lowercase for case-insensitive matching
98    fn normalize_id(raw_id: &str) -> String {
99        raw_id.split(|c: char| c.is_whitespace() || c == '|' || c == ';')
100            .next()
101            .unwrap_or(raw_id)
102            .to_lowercase()
103    }
104
105    /// Process FASTA format files to extract matching sequences
106    fn process_fasta(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
107        let reader = fasta::Reader::from_file(path)
108            .expect(&format!("Failed to open FASTA file: {}", path.display()));
109            
110        for record in reader.records() {
111            let record = record
112                .expect(&format!("Failed to parse FASTA record in {}", path.display()));
113                
114            if ids.contains(&Self::normalize_id(record.id())) {
115                writer.fa.write_record(&record)
116                    .expect(&format!("Failed to write FASTA record: {}", record.id()));
117            }
118        }
119    }
120
121    /// Process GFF format files to extract matching annotations
122    fn process_gff(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
123        let mut reader = gff::Reader::from_file(path, GffType::GFF3)
124            .expect(&format!("Failed to open GFF file: {}", path.display()));
125            
126        for record in reader.records() {
127            let record = record
128                .expect(&format!("Failed to parse GFF record in {}", path.display()));
129                
130            if let Some(id) = record.attributes().get("ID") {
131                if ids.contains(&Self::normalize_id(id)) {
132                    writer.gff.write(&record)
133                        .expect(&format!("Failed to write GFF record: {}", id));
134                }
135            }
136        }
137    }
138
139    /// Process FASTQ format files to extract matching sequences
140    fn process_fastq(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
141        let reader = fastq::Reader::from_file(path)
142            .expect(&format!("Failed to open FASTQ file: {}", path.display()));
143            
144        for record in reader.records() {
145            let record = record
146                .expect(&format!("Failed to parse FASTQ record in {}", path.display()));
147                
148            if ids.contains(&Self::normalize_id(record.id())) {
149                writer.fq.write_record(&record)
150                    .expect(&format!("Failed to write FASTQ record: {}", record.id()));
151            }
152        }
153    }
154}
155
/// Extracts the sub-sequences described by GFF3 annotations from FASTA
/// sequence files and writes them out as new FASTA records.
///
/// Stateless marker type: all functionality is exposed via the
/// associated `extract` function.
pub struct ExtractExplain;
158
159impl ExtractExplain {
160    /// Extract annotated features from sequences
161    /// 
162    /// # Arguments
163    /// * `seq_files` - FASTA files containing sequences
164    /// * `anno_files` - GFF files containing annotations
165    /// * `output` - Output directory for extracted features
166    pub fn extract(seq_files: Vec<PathBuf>, anno_files: Vec<PathBuf>, output: PathBuf) {
167        // Create output directory
168        fs::create_dir_all(&output).unwrap_or_else(|e| {
169            e_exit("FS", &format!("Failed to create output directory: {}", e), 1);
170        });
171
172        // Process each sequence file in parallel
173        seq_files.par_iter().for_each(|seq_path| {
174            // Load sequence data
175            let seq_data = Self::load_sequences(seq_path)
176                .unwrap_or_else(|e| e_exit("SEQ-LOAD", &e, 2));
177
178            // Process all annotation files
179            let annotations: Vec<_> = anno_files.par_iter()
180                .flat_map(|anno_path| {
181                    Self::load_annotations(anno_path)
182                        .unwrap_or_else(|e| e_exit("ANN-LOAD", &e, 3))
183                })
184                .collect();
185
186            // Generate result file
187            let output_path = output.join(seq_path.file_name().unwrap());
188            Self::generate_annotated_file(&seq_data, &annotations, &output_path)
189                .unwrap_or_else(|e| e_exit("OUTPUT", &e, 4));
190        });
191    }
192
193    /// Load sequence data into memory (suitable for small to medium files)
194    /// 
195    /// Returns a map of sequence IDs to FASTA records
196    fn load_sequences(path: &Path) -> Result<HashMap<String, fasta::Record>, String> {
197        let reader = fasta::Reader::from_file(path)
198            .map_err(|e| format!("Failed to read sequence file: {} - {}", path.display(), e))?;
199
200        let mut seq_map = HashMap::new();
201        for record in reader.records() {
202            let record = record.map_err(|e| format!("Failed to parse FASTA: {}", e))?;
203            seq_map.insert(record.id().to_string(), record);
204        }
205        Ok(seq_map)
206    }
207
208    /// Load GFF annotations
209    /// 
210    /// Returns a vector of GFF records
211    fn load_annotations(path: &Path) -> Result<Vec<gff::Record>, String> {
212        let mut reader = gff::Reader::from_file(path, GffType::GFF3)
213            .map_err(|e| format!("Failed to read annotation file: {} - {}", path.display(), e))?;
214
215        reader.records()
216            .map(|r| r.map_err(|e| format!("Failed to parse GFF: {}", e)))
217            .collect()
218    }
219
220    /// Generate annotated sequence files
221    /// 
222    /// Extracts sequence segments based on annotations and writes to output file
223    fn generate_annotated_file(
224        seq_data: &HashMap<String, fasta::Record>,
225        annotations: &[gff::Record],
226        output: &Path
227    ) -> Result<(), String> {
228        let mut writer = fasta::Writer::new(File::create(output)
229            .map_err(|e| format!("Failed to create output file: {} - {}", output.display(), e))?);
230
231        // Generate feature sequences for each annotation
232        for ann in annotations {
233            let seq_id = ann.seqname();
234            let Some(seq) = seq_data.get(seq_id) else {
235                e_println("ANN-SKIP", &format!("Sequence not found: {}", seq_id));
236                continue;
237            };
238
239            // Extract sequence for the annotated region
240            let feature_seq = Self::extract_feature(seq, ann)
241                .map_err(|e| format!("Failed to extract feature: {}", e))?;
242
243            // Generate description
244            let description = format!("{}:{}-{} {}",
245                                      ann.feature_type(),
246                                      ann.start(),
247                                      ann.end(),
248                                      ann.attributes().get("ID").unwrap_or(&"unknown".to_string())
249            );
250
251            // Write new record
252            let new_record = fasta::Record::with_attrs(
253                ann.attributes().get("ID").unwrap_or(&ann.seqname().to_string()),
254                Some(&description),
255                &feature_seq
256            );
257            writer.write_record(&new_record)
258                .map_err(|e| format!("Write failed: {}", e))?;
259        }
260        Ok(())
261    }
262
263    /// Extract sequence segment for a feature
264    /// 
265    /// Extracts the subsequence corresponding to the annotation coordinates
266    fn extract_feature(seq: &fasta::Record, ann: &gff::Record) -> Result<Vec<u8>, String> {
267        // Convert from 1-based GFF coordinates to 0-based index
268        let start = ann.start().saturating_sub(1).to_owned();
269        let end = ann.end().to_owned();
270
271        // Validate coordinates
272        if start >= seq.seq().len() as u64 || end > seq.seq().len() as u64 {
273            return Err(format!("Invalid range: {}-{} (sequence length: {})",
274                               ann.start(), ann.end(), seq.seq().len()));
275        }
276
277        Ok(seq.seq()[start as usize..end as usize].to_vec())
278    }
279}