1use std::collections::{HashMap, HashSet};
2use std::fs::File;
3use std::{fs, io};
4use std::io::BufRead;
5use crate::error::{e_exit, e_println};
6use crate::utils::{FileType, MultiFormatWriter};
7use bio::io::gff::GffType;
8use bio::io::{fasta, fastq, gff};
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11use rayon::iter::IntoParallelRefIterator;
12use rayon::iter::ParallelIterator;
13
14pub struct ExtractSegment;
16
17impl ExtractSegment {
18 pub fn extract_id(paths: Vec<PathBuf>, id: String, output: PathBuf) {
25 let id_set = vec![Self::normalize_id(&id)].into_iter().collect();
26 Self::process_files_parallel(paths, &id_set, &output)
27 }
28
29 pub fn extract_id_files(paths: Vec<PathBuf>, id_file: PathBuf, output: PathBuf) {
36 let id_set = match Self::load_id_set(&id_file) {
37 Ok(set) => set,
38 Err(e) => e_exit("ID-LOAD", &format!("Failed to load IDs: {}", e), 1),
39 };
40 Self::process_files_parallel(paths, &id_set, &output)
41 }
42
43 fn process_files_parallel(paths: Vec<PathBuf>, id_set: &HashSet<String>, output: &PathBuf) {
45 let writer = match MultiFormatWriter::new(output) {
46 Ok(w) => Arc::new(Mutex::new(w)),
47 Err(e) => e_exit("WRITER", &format!("Output init failed: {}", e), 2),
48 };
49
50 paths.par_iter().for_each(|path| {
51 let writer = Arc::clone(&writer);
52 match FileType::infer_file_type(path) {
53 FileType::Fasta => Self::process_file(path, id_set, writer, Self::process_fasta),
54 FileType::Gff => Self::process_file(path, id_set, writer, Self::process_gff),
55 FileType::Fastq => Self::process_file(path, id_set, writer, Self::process_fastq),
56 FileType::Unknown => e_println("TYPE-ERROR", &format!("Unsupported format: {:?}", path)),
57 };
58 });
59 }
60
61 fn process_file<P>(
63 path: &PathBuf,
64 ids: &HashSet<String>,
65 writer: Arc<Mutex<MultiFormatWriter>>,
66 processor: P,
67 ) where
68 P: Fn(&PathBuf, &HashSet<String>, &mut MultiFormatWriter),
69 {
70 let mut writer = writer.lock().unwrap_or_else(|e| {
71 e_exit("LOCK-ERROR", &format!("Writer lock failed: {}", e), 3)
72 });
73
74 processor(path, ids, &mut writer);
75 }
76
77 fn load_id_set(path: &PathBuf) -> io::Result<HashSet<String>> {
79 let file = File::open(path).map_err(|e| {
80 e_println("FILE-ERROR", &format!("Open {} failed: {}", path.display(), e));
81 e
82 })?;
83
84 let mut set = HashSet::new();
85 for line in io::BufReader::new(file).lines() {
86 let raw_id = line?.trim().to_string();
87 if !raw_id.is_empty() {
88 set.insert(Self::normalize_id(&raw_id));
89 }
90 }
91 Ok(set)
92 }
93
94 fn normalize_id(raw_id: &str) -> String {
99 raw_id.split(|c: char| c.is_whitespace() || c == '|' || c == ';')
100 .next()
101 .unwrap_or(raw_id)
102 .to_lowercase()
103 }
104
105 fn process_fasta(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
107 let reader = fasta::Reader::from_file(path)
108 .expect(&format!("Failed to open FASTA file: {}", path.display()));
109
110 for record in reader.records() {
111 let record = record
112 .expect(&format!("Failed to parse FASTA record in {}", path.display()));
113
114 if ids.contains(&Self::normalize_id(record.id())) {
115 writer.fa.write_record(&record)
116 .expect(&format!("Failed to write FASTA record: {}", record.id()));
117 }
118 }
119 }
120
121 fn process_gff(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
123 let mut reader = gff::Reader::from_file(path, GffType::GFF3)
124 .expect(&format!("Failed to open GFF file: {}", path.display()));
125
126 for record in reader.records() {
127 let record = record
128 .expect(&format!("Failed to parse GFF record in {}", path.display()));
129
130 if let Some(id) = record.attributes().get("ID") {
131 if ids.contains(&Self::normalize_id(id)) {
132 writer.gff.write(&record)
133 .expect(&format!("Failed to write GFF record: {}", id));
134 }
135 }
136 }
137 }
138
139 fn process_fastq(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
141 let reader = fastq::Reader::from_file(path)
142 .expect(&format!("Failed to open FASTQ file: {}", path.display()));
143
144 for record in reader.records() {
145 let record = record
146 .expect(&format!("Failed to parse FASTQ record in {}", path.display()));
147
148 if ids.contains(&Self::normalize_id(record.id())) {
149 writer.fq.write_record(&record)
150 .expect(&format!("Failed to write FASTQ record: {}", record.id()));
151 }
152 }
153 }
154}
155
156pub struct ExtractExplain;
158
159impl ExtractExplain {
160 pub fn extract(seq_files: Vec<PathBuf>, anno_files: Vec<PathBuf>, output: PathBuf) {
167 fs::create_dir_all(&output).unwrap_or_else(|e| {
169 e_exit("FS", &format!("Failed to create output directory: {}", e), 1);
170 });
171
172 seq_files.par_iter().for_each(|seq_path| {
174 let seq_data = Self::load_sequences(seq_path)
176 .unwrap_or_else(|e| e_exit("SEQ-LOAD", &e, 2));
177
178 let annotations: Vec<_> = anno_files.par_iter()
180 .flat_map(|anno_path| {
181 Self::load_annotations(anno_path)
182 .unwrap_or_else(|e| e_exit("ANN-LOAD", &e, 3))
183 })
184 .collect();
185
186 let output_path = output.join(seq_path.file_name().unwrap());
188 Self::generate_annotated_file(&seq_data, &annotations, &output_path)
189 .unwrap_or_else(|e| e_exit("OUTPUT", &e, 4));
190 });
191 }
192
193 fn load_sequences(path: &Path) -> Result<HashMap<String, fasta::Record>, String> {
197 let reader = fasta::Reader::from_file(path)
198 .map_err(|e| format!("Failed to read sequence file: {} - {}", path.display(), e))?;
199
200 let mut seq_map = HashMap::new();
201 for record in reader.records() {
202 let record = record.map_err(|e| format!("Failed to parse FASTA: {}", e))?;
203 seq_map.insert(record.id().to_string(), record);
204 }
205 Ok(seq_map)
206 }
207
208 fn load_annotations(path: &Path) -> Result<Vec<gff::Record>, String> {
212 let mut reader = gff::Reader::from_file(path, GffType::GFF3)
213 .map_err(|e| format!("Failed to read annotation file: {} - {}", path.display(), e))?;
214
215 reader.records()
216 .map(|r| r.map_err(|e| format!("Failed to parse GFF: {}", e)))
217 .collect()
218 }
219
220 fn generate_annotated_file(
224 seq_data: &HashMap<String, fasta::Record>,
225 annotations: &[gff::Record],
226 output: &Path
227 ) -> Result<(), String> {
228 let mut writer = fasta::Writer::new(File::create(output)
229 .map_err(|e| format!("Failed to create output file: {} - {}", output.display(), e))?);
230
231 for ann in annotations {
233 let seq_id = ann.seqname();
234 let Some(seq) = seq_data.get(seq_id) else {
235 e_println("ANN-SKIP", &format!("Sequence not found: {}", seq_id));
236 continue;
237 };
238
239 let feature_seq = Self::extract_feature(seq, ann)
241 .map_err(|e| format!("Failed to extract feature: {}", e))?;
242
243 let description = format!("{}:{}-{} {}",
245 ann.feature_type(),
246 ann.start(),
247 ann.end(),
248 ann.attributes().get("ID").unwrap_or(&"unknown".to_string())
249 );
250
251 let new_record = fasta::Record::with_attrs(
253 ann.attributes().get("ID").unwrap_or(&ann.seqname().to_string()),
254 Some(&description),
255 &feature_seq
256 );
257 writer.write_record(&new_record)
258 .map_err(|e| format!("Write failed: {}", e))?;
259 }
260 Ok(())
261 }
262
263 fn extract_feature(seq: &fasta::Record, ann: &gff::Record) -> Result<Vec<u8>, String> {
267 let start = ann.start().saturating_sub(1).to_owned();
269 let end = ann.end().to_owned();
270
271 if start >= seq.seq().len() as u64 || end > seq.seq().len() as u64 {
273 return Err(format!("Invalid range: {}-{} (sequence length: {})",
274 ann.start(), ann.end(), seq.seq().len()));
275 }
276
277 Ok(seq.seq()[start as usize..end as usize].to_vec())
278 }
279}