1use std::collections::{HashMap, HashSet};
2use std::fs::File;
3use std::{fs, io};
4use std::io::BufRead;
5use crate::error::{e_exit, e_println};
6use crate::utils::{FileType, MultiFormatWriter};
7use bio::io::gff::GffType;
8use bio::io::{fasta, fastq, gff};
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11use rayon::iter::IntoParallelRefIterator;
12use rayon::iter::ParallelIterator;
13
14pub struct ExtractSegment;
15
16
17impl ExtractSegment {
21 pub fn extract_id(paths: Vec<PathBuf>, id: String, output: PathBuf) {
22 let id_set = vec![Self::normalize_id(&id)].into_iter().collect();
23 Self::process_files_parallel(paths, &id_set, &output)
24 }
25
26 pub fn extract_id_files(paths: Vec<PathBuf>, id_file: PathBuf, output: PathBuf) {
27 let id_set = match Self::load_id_set(&id_file) {
28 Ok(set) => set,
29 Err(e) => e_exit("ID-LOAD", &format!("Failed to load IDs: {}", e), 1),
30 };
31 Self::process_files_parallel(paths, &id_set, &output)
32 }
33
34 fn process_files_parallel(paths: Vec<PathBuf>, id_set: &HashSet<String>, output: &PathBuf) {
35 let writer = match MultiFormatWriter::new(output) {
36 Ok(w) => Arc::new(Mutex::new(w)),
37 Err(e) => e_exit("WRITER", &format!("Output init failed: {}", e), 2),
38 };
39
40 paths.par_iter().for_each(|path| {
41 let writer = Arc::clone(&writer);
42 match FileType::infer_file_type(path) {
43 FileType::Fasta => Self::process_file(path, id_set, writer, Self::process_fasta),
44 FileType::Gff => Self::process_file(path, id_set, writer, Self::process_gff),
45 FileType::Fastq => Self::process_file(path, id_set, writer, Self::process_fastq),
46 FileType::Unknown => e_println("TYPE-ERROR", &format!("Unsupported format: {:?}", path)),
47 };
48 });
49 }
50
51 fn process_file<P>(
52 path: &PathBuf,
53 ids: &HashSet<String>,
54 writer: Arc<Mutex<MultiFormatWriter>>,
55 processor: P,
56 ) where
57 P: Fn(&PathBuf, &HashSet<String>, &mut MultiFormatWriter),
58 {
59 let mut writer = writer.lock().unwrap_or_else(|e| {
60 e_exit("LOCK-ERROR", &format!("Writer lock failed: {}", e), 3)
61 });
62
63 processor(path, ids, &mut writer);
64 }
66
67 fn load_id_set(path: &PathBuf) -> io::Result<HashSet<String>> {
68 let file = File::open(path).map_err(|e| {
69 e_println("FILE-ERROR", &format!("Open {} failed: {}", path.display(), e));
70 e
71 })?;
72
73 let mut set = HashSet::new();
74 for line in io::BufReader::new(file).lines() {
75 let raw_id = line?.trim().to_string();
76 if !raw_id.is_empty() {
77 set.insert(Self::normalize_id(&raw_id));
78 }
79 }
80 Ok(set)
81 }
82
83 fn normalize_id(raw_id: &str) -> String {
84 raw_id.split(|c: char| c.is_whitespace() || c == '|' || c == ';')
85 .next()
86 .unwrap_or(raw_id)
87 .to_lowercase()
88 }
89
90 fn process_fasta(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
91 let reader = fasta::Reader::from_file(path).expect("");
92 for record in reader.records() {
93 let record = record.expect("");
94 if ids.contains(&Self::normalize_id(record.id())) {
95 writer.fa.write_record(&record).expect("");
96 }
97 }
98 }
99
100 fn process_gff(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
101 let mut reader = gff::Reader::from_file(path, GffType::GFF3).expect(""); for record in reader.records() {
103 let record = record.expect("");
104 if let Some(id) = record.attributes().get("ID") {
105 if ids.contains(&Self::normalize_id(id)) {
106 writer.gff.write(&record).expect("");
107 }
108 }
109 }
110 }
111
112 fn process_fastq(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
113 let reader = fastq::Reader::from_file(path).expect("");
114 for record in reader.records() {
115 let record = record.expect("");
116 if ids.contains(&Self::normalize_id(record.id())) {
117 writer.fq.write_record(&record).expect("");
118 }
119 }
120 }
121}
122
123
124
125pub struct ExtractExplain;
126
127impl ExtractExplain {
128 pub fn extract(seq_files: Vec<PathBuf>, anno_files: Vec<PathBuf>, output: PathBuf) {
129 fs::create_dir_all(&output).unwrap_or_else(|e| {
131 e_exit("FS", &format!("创建输出目录失败: {}", e), 1);
132 });
133
134 seq_files.par_iter().for_each(|seq_path| {
136 let seq_data = Self::load_sequences(seq_path)
138 .unwrap_or_else(|e| e_exit("SEQ-LOAD", &e, 2));
139
140 let annotations: Vec<_> = anno_files.par_iter()
142 .flat_map(|anno_path| {
143 Self::load_annotations(anno_path)
144 .unwrap_or_else(|e| e_exit("ANN-LOAD", &e, 3))
145 })
146 .collect();
147
148 let output_path = output.join(seq_path.file_name().unwrap());
150 Self::generate_annotated_file(&seq_data, &annotations, &output_path)
151 .unwrap_or_else(|e| e_exit("OUTPUT", &e, 4));
152 });
153 }
154
155 fn load_sequences(path: &Path) -> Result<HashMap<String, fasta::Record>, String> {
157 let reader = fasta::Reader::from_file(path)
158 .map_err(|e| format!("读取序列文件失败: {} - {}", path.display(), e))?;
159
160 let mut seq_map = HashMap::new();
161 for record in reader.records() {
162 let record = record.map_err(|e| format!("解析FASTA失败: {}", e))?;
163 seq_map.insert(record.id().to_string(), record);
164 }
165 Ok(seq_map)
166 }
167
168 fn load_annotations(path: &Path) -> Result<Vec<gff::Record>, String> {
170 let mut reader = gff::Reader::from_file(path, GffType::GFF3)
171 .map_err(|e| format!("读取注释文件失败: {} - {}", path.display(), e))?;
172
173 reader.records()
174 .map(|r| r.map_err(|e| format!("解析GFF失败: {}", e)))
175 .collect()
176 }
177
178 fn generate_annotated_file(
180 seq_data: &HashMap<String, fasta::Record>,
181 annotations: &[gff::Record],
182 output: &Path
183 ) -> Result<(), String> {
184 let mut writer = fasta::Writer::new(File::create(output)
185 .map_err(|e| format!("创建输出文件失败: {} - {}", output.display(), e))?);
186
187 for ann in annotations {
189 let seq_id = ann.seqname();
190 let Some(seq) = seq_data.get(seq_id) else {
191 e_println("ANN-SKIP", &format!("未找到序列: {}", seq_id));
192 continue;
193 };
194
195 let feature_seq = Self::extract_feature(seq, ann)
197 .map_err(|e| format!("提取特征失败: {}", e))?;
198
199 let description = format!("{}:{}-{} {}",
201 ann.feature_type(),
202 ann.start(),
203 ann.end(),
204 ann.attributes().get("ID").unwrap_or(&"unknown".to_string())
205 );
206
207 let new_record = fasta::Record::with_attrs(
209 ann.attributes().get("ID").unwrap_or(&ann.seqname().to_string()),
210 Some(&description),
211 &feature_seq
212 );
213 writer.write_record(&new_record)
214 .map_err(|e| format!("写入失败: {}", e))?;
215 }
216 Ok(())
217 }
218
219 fn extract_feature(seq: &fasta::Record, ann: &gff::Record) -> Result<Vec<u8>, String> {
221 let start = ann.start().saturating_sub(1); let &end = ann.end();
223
224 if start >= seq.seq().len() as u64 || end > seq.seq().len() as u64 {
225 return Err(format!("无效区间: {}-{} (序列长度: {})",
226 ann.start(), ann.end(), seq.seq().len()));
227 }
228
229 Ok(seq.seq()[start as usize..end as usize].to_vec())
230 }
231}
232