1use std::collections::{HashMap, HashSet};
2use std::fs::File;
3use std::{fs, io};
4use std::io::BufRead;
5use crate::error::{e_exit, e_println};
6use crate::utils::{FileType, MultiFormatWriter};
7use bio::io::gff::GffType;
8use bio::io::{fasta, fastq, gff};
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11use rayon::iter::IntoParallelRefIterator;
12use rayon::iter::ParallelIterator;
13
/// Extracts records matching a set of IDs from FASTA/FASTQ/GFF files,
/// optionally slicing matched FASTA sequences to a sub-range.
pub struct ExtractSegment;
16
17impl ExtractSegment {
18 pub fn extract_id(paths: Vec<PathBuf>, id: String, output: PathBuf, start: Option<usize>, end: Option<usize>) {
27 let id_set = vec![Self::normalize_id(&id)].into_iter().collect();
28 Self::process_files_parallel(paths, &id_set, &output, start, end)
29 }
30
31 pub fn extract_id_files(paths: Vec<PathBuf>, id_file: PathBuf, output: PathBuf, start: Option<usize>, end: Option<usize>) {
40 let id_set = match Self::load_id_set(&id_file) {
41 Ok(set) => set,
42 Err(e) => e_exit("ID-LOAD", &format!("Failed to load IDs: {}", e), 1),
43 };
44 Self::process_files_parallel(paths, &id_set, &output, start, end)
45 }
46
47 fn process_files_parallel(paths: Vec<PathBuf>, id_set: &HashSet<String>, output: &PathBuf, start: Option<usize>, end: Option<usize>) {
49 let writer = match MultiFormatWriter::new(output) {
50 Ok(w) => Arc::new(Mutex::new(w)),
51 Err(e) => e_exit("WRITER", &format!("Output init failed: {}", e), 2),
52 };
53
54 paths.par_iter().for_each(|path| {
55 let writer = Arc::clone(&writer);
56 match FileType::infer_file_type(path) {
57 FileType::Fasta => Self::process_file(path, id_set, writer, |p, ids, w| Self::process_fasta(p, ids, w, start, end)),
58 FileType::Gff => Self::process_file(path, id_set, writer, |p, ids, w| Self::process_gff(p, ids, w)),
59 FileType::Fastq => Self::process_file(path, id_set, writer, |p, ids, w| Self::process_fastq(p, ids, w)),
60 FileType::Unknown => e_println("TYPE-ERROR", &format!("Unsupported format: {:?}", path)),
61 };
62 });
63 }
64
65 fn process_file<P>(
67 path: &PathBuf,
68 ids: &HashSet<String>,
69 writer: Arc<Mutex<MultiFormatWriter>>,
70 processor: P,
71 ) where
72 P: Fn(&PathBuf, &HashSet<String>, &mut MultiFormatWriter),
73 {
74 let mut writer = writer.lock().unwrap_or_else(|e| {
75 e_exit("LOCK-ERROR", &format!("Writer lock failed: {}", e), 3)
76 });
77
78 processor(path, ids, &mut writer);
79 }
80
81 fn load_id_set(path: &PathBuf) -> io::Result<HashSet<String>> {
83 let file = File::open(path).map_err(|e| {
84 e_println("FILE-ERROR", &format!("Open {} failed: {}", path.display(), e));
85 e
86 })?;
87
88 let mut set = HashSet::new();
89 for line in io::BufReader::new(file).lines() {
90 let raw_id = line?.trim().to_string();
91 if !raw_id.is_empty() {
92 set.insert(Self::normalize_id(&raw_id));
93 }
94 }
95 Ok(set)
96 }
97
98 fn normalize_id(raw_id: &str) -> String {
103 raw_id.split(|c: char| c.is_whitespace() || c == '|' || c == ';')
104 .next()
105 .unwrap_or(raw_id)
106 .to_lowercase()
107 }
108
109 fn process_fasta(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter, start: Option<usize>, end: Option<usize>) {
111 let reader = fasta::Reader::from_file(path)
112 .expect(&format!("Failed to open FASTA file: {}", path.display()));
113
114 for record in reader.records() {
115 let record = record
116 .expect(&format!("Failed to parse FASTA record in {}", path.display()));
117
118 if ids.contains(&Self::normalize_id(record.id())) {
119 if start.is_some() || end.is_some() {
121 let start_pos = start.unwrap_or(0);
122 let end_pos = end.unwrap_or(record.seq().len());
123
124 if start_pos >= record.seq().len() {
126 e_println("RANGE-ERROR", &format!("Start position {} out of range for sequence {} (length {})",
127 start_pos, record.id(), record.seq().len()));
128 continue;
129 }
130
131 if end_pos > record.seq().len() {
132 e_println("RANGE-ERROR", &format!("End position {} out of range for sequence {} (length {})",
133 end_pos, record.id(), record.seq().len()));
134 continue;
135 }
136
137 if start_pos >= end_pos {
138 e_println("RANGE-ERROR", &format!("Invalid range: start ({}) must be less than end ({})",
139 start_pos, end_pos));
140 continue;
141 }
142
143 let segment_seq = record.seq()[start_pos..end_pos].to_vec();
145 let description = match record.desc() {
146 Some(desc) => Some(format!("{} segment:{}..{}", desc, start_pos, end_pos)),
147 None => Some(format!("segment:{}..{}", start_pos, end_pos)),
148 };
149 let segment_record = fasta::Record::with_attrs(record.id(), description.as_deref(), &segment_seq);
150
151 writer.fa.write_record(&segment_record)
152 .expect(&format!("Failed to write FASTA record segment: {}", record.id()));
153 } else {
154 writer.fa.write_record(&record)
156 .expect(&format!("Failed to write FASTA record: {}", record.id()));
157 }
158 }
159 }
160 }
161
162 fn process_gff(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
164 let mut reader = gff::Reader::from_file(path, GffType::GFF3)
165 .expect(&format!("Failed to open GFF file: {}", path.display()));
166
167 for record in reader.records() {
168 let record = record
169 .expect(&format!("Failed to parse GFF record in {}", path.display()));
170
171 if let Some(id) = record.attributes().get("ID") {
172 if ids.contains(&Self::normalize_id(id)) {
173 writer.gff.write(&record)
174 .expect(&format!("Failed to write GFF record: {}", id));
175 }
176 }
177 }
178 }
179
180 fn process_fastq(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
182 let reader = fastq::Reader::from_file(path)
183 .expect(&format!("Failed to open FASTQ file: {}", path.display()));
184
185 for record in reader.records() {
186 let record = record
187 .expect(&format!("Failed to parse FASTQ record in {}", path.display()));
188
189 if ids.contains(&Self::normalize_id(record.id())) {
190 writer.fq.write_record(&record)
191 .expect(&format!("Failed to write FASTQ record: {}", record.id()));
192 }
193 }
194 }
195}
196
/// Cuts annotated feature sub-sequences out of FASTA files using GFF3
/// annotations and writes them as new FASTA files under an output directory.
pub struct ExtractExplain;
199
200impl ExtractExplain {
201 pub fn extract(seq_files: Vec<PathBuf>, anno_files: Vec<PathBuf>, output: PathBuf, feature_types: Option<Vec<String>>) {
209 fs::create_dir_all(&output).unwrap_or_else(|e| {
211 e_exit("FS", &format!("Failed to create output directory: {}", e), 1);
212 });
213
214 seq_files.par_iter().for_each(|seq_path| {
216 let seq_data = Self::load_sequences(seq_path)
218 .unwrap_or_else(|e| e_exit("SEQ-LOAD", &e, 2));
219
220 let annotations: Vec<_> = anno_files.par_iter()
222 .flat_map(|anno_path| {
223 let anns = Self::load_annotations(anno_path)
224 .unwrap_or_else(|e| e_exit("ANN-LOAD", &e, 3));
225
226 if let Some(types) = &feature_types {
228 anns.into_iter()
229 .filter(|ann| types.iter().any(|t| t.eq_ignore_ascii_case(ann.feature_type())))
230 .collect::<Vec<_>>()
231 } else {
232 anns
233 }
234 })
235 .collect();
236
237 let output_path = output.join(seq_path.file_name().unwrap());
239 Self::generate_annotated_file(&seq_data, &annotations, &output_path)
240 .unwrap_or_else(|e| e_exit("OUTPUT", &e, 4));
241 });
242 }
243
244 fn load_sequences(path: &Path) -> Result<HashMap<String, fasta::Record>, String> {
248 let reader = fasta::Reader::from_file(path)
249 .map_err(|e| format!("Failed to read sequence file: {} - {}", path.display(), e))?;
250
251 let mut seq_map = HashMap::new();
252 for record in reader.records() {
253 let record = record.map_err(|e| format!("Failed to parse FASTA: {}", e))?;
254 seq_map.insert(record.id().to_string(), record);
255 }
256 Ok(seq_map)
257 }
258
259 fn load_annotations(path: &Path) -> Result<Vec<gff::Record>, String> {
263 let mut reader = gff::Reader::from_file(path, GffType::GFF3)
264 .map_err(|e| format!("Failed to read annotation file: {} - {}", path.display(), e))?;
265
266 reader.records()
267 .map(|r| r.map_err(|e| format!("Failed to parse GFF: {}", e)))
268 .collect()
269 }
270
271 fn generate_annotated_file(
275 seq_data: &HashMap<String, fasta::Record>,
276 annotations: &[gff::Record],
277 output: &Path
278 ) -> Result<(), String> {
279 let mut writer = fasta::Writer::new(File::create(output)
280 .map_err(|e| format!("Failed to create output file: {} - {}", output.display(), e))?);
281
282 for ann in annotations {
284 let seq_id = ann.seqname();
285 let Some(seq) = seq_data.get(seq_id) else {
286 e_println("ANN-SKIP", &format!("Sequence not found: {}", seq_id));
287 continue;
288 };
289
290 let feature_seq = Self::extract_feature(seq, ann)
292 .map_err(|e| format!("Failed to extract feature: {}", e))?;
293
294 let description = format!("{}:{}-{} {}",
296 ann.feature_type(),
297 ann.start(),
298 ann.end(),
299 ann.attributes().get("ID").unwrap_or(&"unknown".to_string())
300 );
301
302 let new_record = fasta::Record::with_attrs(
304 ann.attributes().get("ID").unwrap_or(&ann.seqname().to_string()),
305 Some(&description),
306 &feature_seq
307 );
308 writer.write_record(&new_record)
309 .map_err(|e| format!("Write failed: {}", e))?;
310 }
311 Ok(())
312 }
313
314 fn extract_feature(seq: &fasta::Record, ann: &gff::Record) -> Result<Vec<u8>, String> {
318 let start = ann.start().saturating_sub(1).to_owned();
320 let end = ann.end().to_owned();
321
322 if start >= seq.seq().len() as u64 || end > seq.seq().len() as u64 {
324 return Err(format!("Invalid range: {}-{} (sequence length: {})",
325 ann.start(), ann.end(), seq.seq().len()));
326 }
327
328 Ok(seq.seq()[start as usize..end as usize].to_vec())
329 }
330}