//! seq_here/extract.rs

1use std::collections::{HashMap, HashSet};
2use std::fs::File;
3use std::{fs, io};
4use std::io::BufRead;
5use crate::error::{e_exit, e_println};
6use crate::utils::{FileType, MultiFormatWriter};
7use bio::io::gff::GffType;
8use bio::io::{fasta, fastq, gff};
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11use rayon::iter::IntoParallelRefIterator;
12use rayon::iter::ParallelIterator;
13
14/// Extract specific segments from biological sequence files
15pub struct ExtractSegment;
16
17impl ExtractSegment {
18    /// Extract sequences that match a single ID
19    /// 
20    /// # Arguments
21    /// * `paths` - Input sequence files (FASTA, FASTQ, GFF)
22    /// * `id` - Sequence identifier to extract
23    /// * `output` - Output file path
24    /// * `start` - Optional start position (0-based) for the extracted segment
25    /// * `end` - Optional end position (0-based, exclusive) for the extracted segment
26    pub fn extract_id(paths: Vec<PathBuf>, id: String, output: PathBuf, start: Option<usize>, end: Option<usize>) {
27        let id_set = vec![Self::normalize_id(&id)].into_iter().collect();
28        Self::process_files_parallel(paths, &id_set, &output, start, end)
29    }
30
31    /// Extract sequences matching IDs from a file
32    /// 
33    /// # Arguments
34    /// * `paths` - Input sequence files (FASTA, FASTQ, GFF)
35    /// * `id_file` - File containing IDs to extract (one per line)
36    /// * `output` - Output file path
37    /// * `start` - Optional start position (0-based) for the extracted segment
38    /// * `end` - Optional end position (0-based, exclusive) for the extracted segment
39    pub fn extract_id_files(paths: Vec<PathBuf>, id_file: PathBuf, output: PathBuf, start: Option<usize>, end: Option<usize>) {
40        let id_set = match Self::load_id_set(&id_file) {
41            Ok(set) => set,
42            Err(e) => e_exit("ID-LOAD", &format!("Failed to load IDs: {}", e), 1),
43        };
44        Self::process_files_parallel(paths, &id_set, &output, start, end)
45    }
46
47    /// Process multiple files in parallel
48    fn process_files_parallel(paths: Vec<PathBuf>, id_set: &HashSet<String>, output: &PathBuf, start: Option<usize>, end: Option<usize>) {
49        let writer = match MultiFormatWriter::new(output) {
50            Ok(w) => Arc::new(Mutex::new(w)),
51            Err(e) => e_exit("WRITER", &format!("Output init failed: {}", e), 2),
52        };
53
54        paths.par_iter().for_each(|path| {
55            let writer = Arc::clone(&writer);
56            match FileType::infer_file_type(path) {
57                FileType::Fasta => Self::process_file(path, id_set, writer, |p, ids, w| Self::process_fasta(p, ids, w, start, end)),
58                FileType::Gff => Self::process_file(path, id_set, writer, |p, ids, w| Self::process_gff(p, ids, w)),
59                FileType::Fastq => Self::process_file(path, id_set, writer, |p, ids, w| Self::process_fastq(p, ids, w)),
60                FileType::Unknown => e_println("TYPE-ERROR", &format!("Unsupported format: {:?}", path)),
61            };
62        });
63    }
64
65    /// Process a single file with the appropriate processor function
66    fn process_file<P>(
67        path: &PathBuf,
68        ids: &HashSet<String>,
69        writer: Arc<Mutex<MultiFormatWriter>>,
70        processor: P,
71    ) where
72        P: Fn(&PathBuf, &HashSet<String>, &mut MultiFormatWriter),
73    {
74        let mut writer = writer.lock().unwrap_or_else(|e| {
75            e_exit("LOCK-ERROR", &format!("Writer lock failed: {}", e), 3)
76        });
77
78        processor(path, ids, &mut writer);
79    }
80
81    /// Load sequence IDs from a file into a HashSet
82    fn load_id_set(path: &PathBuf) -> io::Result<HashSet<String>> {
83        let file = File::open(path).map_err(|e| {
84            e_println("FILE-ERROR", &format!("Open {} failed: {}", path.display(), e));
85            e
86        })?;
87
88        let mut set = HashSet::new();
89        for line in io::BufReader::new(file).lines() {
90            let raw_id = line?.trim().to_string();
91            if !raw_id.is_empty() {
92                set.insert(Self::normalize_id(&raw_id));
93            }
94        }
95        Ok(set)
96    }
97
98    /// Normalize sequence identifiers for consistent matching
99    /// 
100    /// Takes the first part before any whitespace, pipe, or semicolon
101    /// and converts to lowercase for case-insensitive matching
102    fn normalize_id(raw_id: &str) -> String {
103        raw_id.split(|c: char| c.is_whitespace() || c == '|' || c == ';')
104            .next()
105            .unwrap_or(raw_id)
106            .to_lowercase()
107    }
108
109    /// Process FASTA format files to extract matching sequences
110    fn process_fasta(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter, start: Option<usize>, end: Option<usize>) {
111        let reader = fasta::Reader::from_file(path)
112            .expect(&format!("Failed to open FASTA file: {}", path.display()));
113            
114        for record in reader.records() {
115            let record = record
116                .expect(&format!("Failed to parse FASTA record in {}", path.display()));
117                
118            if ids.contains(&Self::normalize_id(record.id())) {
119                // Apply start and end positions if specified
120                if start.is_some() || end.is_some() {
121                    let start_pos = start.unwrap_or(0);
122                    let end_pos = end.unwrap_or(record.seq().len());
123                    
124                    // Validate positions
125                    if start_pos >= record.seq().len() {
126                        e_println("RANGE-ERROR", &format!("Start position {} out of range for sequence {} (length {})", 
127                            start_pos, record.id(), record.seq().len()));
128                        continue;
129                    }
130                    
131                    if end_pos > record.seq().len() {
132                        e_println("RANGE-ERROR", &format!("End position {} out of range for sequence {} (length {})", 
133                            end_pos, record.id(), record.seq().len()));
134                        continue;
135                    }
136                    
137                    if start_pos >= end_pos {
138                        e_println("RANGE-ERROR", &format!("Invalid range: start ({}) must be less than end ({})", 
139                            start_pos, end_pos));
140                        continue;
141                    }
142                    
143                    // Create a new record with the extracted segment
144                    let segment_seq = record.seq()[start_pos..end_pos].to_vec();
145                    let description = match record.desc() {
146                        Some(desc) => Some(format!("{} segment:{}..{}", desc, start_pos, end_pos)),
147                        None => Some(format!("segment:{}..{}", start_pos, end_pos)),
148                    };
149                    let segment_record = fasta::Record::with_attrs(record.id(), description.as_deref(), &segment_seq);
150                    
151                    writer.fa.write_record(&segment_record)
152                        .expect(&format!("Failed to write FASTA record segment: {}", record.id()));
153                } else {
154                    // Write the complete record if no positions specified
155                    writer.fa.write_record(&record)
156                        .expect(&format!("Failed to write FASTA record: {}", record.id()));
157                }
158            }
159        }
160    }
161
162    /// Process GFF format files to extract matching annotations
163    fn process_gff(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
164        let mut reader = gff::Reader::from_file(path, GffType::GFF3)
165            .expect(&format!("Failed to open GFF file: {}", path.display()));
166            
167        for record in reader.records() {
168            let record = record
169                .expect(&format!("Failed to parse GFF record in {}", path.display()));
170                
171            if let Some(id) = record.attributes().get("ID") {
172                if ids.contains(&Self::normalize_id(id)) {
173                    writer.gff.write(&record)
174                        .expect(&format!("Failed to write GFF record: {}", id));
175                }
176            }
177        }
178    }
179
180    /// Process FASTQ format files to extract matching sequences
181    fn process_fastq(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
182        let reader = fastq::Reader::from_file(path)
183            .expect(&format!("Failed to open FASTQ file: {}", path.display()));
184            
185        for record in reader.records() {
186            let record = record
187                .expect(&format!("Failed to parse FASTQ record in {}", path.display()));
188                
189            if ids.contains(&Self::normalize_id(record.id())) {
190                writer.fq.write_record(&record)
191                    .expect(&format!("Failed to write FASTQ record: {}", record.id()));
192            }
193        }
194    }
195}
196
197/// Extract and explain annotated sequence features
198pub struct ExtractExplain;
199
200impl ExtractExplain {
201    /// Extract annotated features from sequences
202    /// 
203    /// # Arguments
204    /// * `seq_files` - FASTA files containing sequences
205    /// * `anno_files` - GFF files containing annotations
206    /// * `output` - Output directory for extracted features
207    /// * `feature_types` - Optional set of feature types to extract (e.g., "CDS", "gene")
208    pub fn extract(seq_files: Vec<PathBuf>, anno_files: Vec<PathBuf>, output: PathBuf, feature_types: Option<Vec<String>>) {
209        // Create output directory
210        fs::create_dir_all(&output).unwrap_or_else(|e| {
211            e_exit("FS", &format!("Failed to create output directory: {}", e), 1);
212        });
213
214        // Process each sequence file in parallel
215        seq_files.par_iter().for_each(|seq_path| {
216            // Load sequence data
217            let seq_data = Self::load_sequences(seq_path)
218                .unwrap_or_else(|e| e_exit("SEQ-LOAD", &e, 2));
219
220            // Process all annotation files
221            let annotations: Vec<_> = anno_files.par_iter()
222                .flat_map(|anno_path| {
223                    let anns = Self::load_annotations(anno_path)
224                        .unwrap_or_else(|e| e_exit("ANN-LOAD", &e, 3));
225                    
226                    // Filter annotations by feature type if specified
227                    if let Some(types) = &feature_types {
228                        anns.into_iter()
229                            .filter(|ann| types.iter().any(|t| t.eq_ignore_ascii_case(ann.feature_type())))
230                            .collect::<Vec<_>>()
231                    } else {
232                        anns
233                    }
234                })
235                .collect();
236
237            // Generate result file
238            let output_path = output.join(seq_path.file_name().unwrap());
239            Self::generate_annotated_file(&seq_data, &annotations, &output_path)
240                .unwrap_or_else(|e| e_exit("OUTPUT", &e, 4));
241        });
242    }
243
244    /// Load sequence data into memory (suitable for small to medium files)
245    /// 
246    /// Returns a map of sequence IDs to FASTA records
247    fn load_sequences(path: &Path) -> Result<HashMap<String, fasta::Record>, String> {
248        let reader = fasta::Reader::from_file(path)
249            .map_err(|e| format!("Failed to read sequence file: {} - {}", path.display(), e))?;
250
251        let mut seq_map = HashMap::new();
252        for record in reader.records() {
253            let record = record.map_err(|e| format!("Failed to parse FASTA: {}", e))?;
254            seq_map.insert(record.id().to_string(), record);
255        }
256        Ok(seq_map)
257    }
258
259    /// Load GFF annotations
260    /// 
261    /// Returns a vector of GFF records
262    fn load_annotations(path: &Path) -> Result<Vec<gff::Record>, String> {
263        let mut reader = gff::Reader::from_file(path, GffType::GFF3)
264            .map_err(|e| format!("Failed to read annotation file: {} - {}", path.display(), e))?;
265
266        reader.records()
267            .map(|r| r.map_err(|e| format!("Failed to parse GFF: {}", e)))
268            .collect()
269    }
270
271    /// Generate annotated sequence files
272    /// 
273    /// Extracts sequence segments based on annotations and writes to output file
274    fn generate_annotated_file(
275        seq_data: &HashMap<String, fasta::Record>,
276        annotations: &[gff::Record],
277        output: &Path
278    ) -> Result<(), String> {
279        let mut writer = fasta::Writer::new(File::create(output)
280            .map_err(|e| format!("Failed to create output file: {} - {}", output.display(), e))?);
281
282        // Generate feature sequences for each annotation
283        for ann in annotations {
284            let seq_id = ann.seqname();
285            let Some(seq) = seq_data.get(seq_id) else {
286                e_println("ANN-SKIP", &format!("Sequence not found: {}", seq_id));
287                continue;
288            };
289
290            // Extract sequence for the annotated region
291            let feature_seq = Self::extract_feature(seq, ann)
292                .map_err(|e| format!("Failed to extract feature: {}", e))?;
293
294            // Generate description
295            let description = format!("{}:{}-{} {}",
296                                      ann.feature_type(),
297                                      ann.start(),
298                                      ann.end(),
299                                      ann.attributes().get("ID").unwrap_or(&"unknown".to_string())
300            );
301
302            // Write new record
303            let new_record = fasta::Record::with_attrs(
304                ann.attributes().get("ID").unwrap_or(&ann.seqname().to_string()),
305                Some(&description),
306                &feature_seq
307            );
308            writer.write_record(&new_record)
309                .map_err(|e| format!("Write failed: {}", e))?;
310        }
311        Ok(())
312    }
313
314    /// Extract sequence segment for a feature
315    /// 
316    /// Extracts the subsequence corresponding to the annotation coordinates
317    fn extract_feature(seq: &fasta::Record, ann: &gff::Record) -> Result<Vec<u8>, String> {
318        // Convert from 1-based GFF coordinates to 0-based index
319        let start = ann.start().saturating_sub(1).to_owned();
320        let end = ann.end().to_owned();
321
322        // Validate coordinates
323        if start >= seq.seq().len() as u64 || end > seq.seq().len() as u64 {
324            return Err(format!("Invalid range: {}-{} (sequence length: {})",
325                               ann.start(), ann.end(), seq.seq().len()));
326        }
327
328        Ok(seq.seq()[start as usize..end as usize].to_vec())
329    }
330}