seq_here/
extract.rs

1use std::collections::{HashMap, HashSet};
2use std::fs::File;
3use std::{fs, io};
4use std::io::BufRead;
5use crate::error::{e_exit, e_println};
6use crate::utils::{FileType, MultiFormatWriter};
7use bio::io::gff::GffType;
8use bio::io::{fasta, fastq, gff};
9use std::path::{Path, PathBuf};
10use std::sync::{Arc, Mutex};
11use rayon::iter::IntoParallelRefIterator;
12use rayon::iter::ParallelIterator;
13
14pub struct ExtractSegment;
15
16
17// TODO: .expect("")  msg
18// TODO:
19
20impl ExtractSegment {
21    pub fn extract_id(paths: Vec<PathBuf>, id: String, output: PathBuf) {
22        let id_set = vec![Self::normalize_id(&id)].into_iter().collect();
23        Self::process_files_parallel(paths, &id_set, &output)
24    }
25
26    pub fn extract_id_files(paths: Vec<PathBuf>, id_file: PathBuf, output: PathBuf) {
27        let id_set = match Self::load_id_set(&id_file) {
28            Ok(set) => set,
29            Err(e) => e_exit("ID-LOAD", &format!("Failed to load IDs: {}", e), 1),
30        };
31        Self::process_files_parallel(paths, &id_set, &output)
32    }
33
34    fn process_files_parallel(paths: Vec<PathBuf>, id_set: &HashSet<String>, output: &PathBuf) {
35        let writer = match MultiFormatWriter::new(output) {
36            Ok(w) => Arc::new(Mutex::new(w)),
37            Err(e) => e_exit("WRITER", &format!("Output init failed: {}", e), 2),
38        };
39
40        paths.par_iter().for_each(|path| {
41            let writer = Arc::clone(&writer);
42            match FileType::infer_file_type(path) {
43                FileType::Fasta => Self::process_file(path, id_set, writer, Self::process_fasta),
44                FileType::Gff => Self::process_file(path, id_set, writer, Self::process_gff),
45                FileType::Fastq => Self::process_file(path, id_set, writer, Self::process_fastq),
46                FileType::Unknown => e_println("TYPE-ERROR", &format!("Unsupported format: {:?}", path)),
47            };
48        });
49    }
50
51    fn process_file<P>(
52        path: &PathBuf,
53        ids: &HashSet<String>,
54        writer: Arc<Mutex<MultiFormatWriter>>,
55        processor: P,
56    ) where
57        P: Fn(&PathBuf, &HashSet<String>, &mut MultiFormatWriter),
58    {
59        let mut writer = writer.lock().unwrap_or_else(|e| {
60            e_exit("LOCK-ERROR", &format!("Writer lock failed: {}", e), 3)
61        });
62
63        processor(path, ids, &mut writer);
64            // .unwrap_or_else(|e| { e_exit("PROCESS-ERROR", &format!("Process {} failed: {}", path.display(), e), 4) } );
65    }
66
67    fn load_id_set(path: &PathBuf) -> io::Result<HashSet<String>> {
68        let file = File::open(path).map_err(|e| {
69            e_println("FILE-ERROR", &format!("Open {} failed: {}", path.display(), e));
70            e
71        })?;
72
73        let mut set = HashSet::new();
74        for line in io::BufReader::new(file).lines() {
75            let raw_id = line?.trim().to_string();
76            if !raw_id.is_empty() {
77                set.insert(Self::normalize_id(&raw_id));
78            }
79        }
80        Ok(set)
81    }
82
83    fn normalize_id(raw_id: &str) -> String {
84        raw_id.split(|c: char| c.is_whitespace() || c == '|' || c == ';')
85            .next()
86            .unwrap_or(raw_id)
87            .to_lowercase()
88    }
89
90    fn process_fasta(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
91        let reader = fasta::Reader::from_file(path).expect("");
92        for record in reader.records() {
93            let record = record.expect("");
94            if ids.contains(&Self::normalize_id(record.id())) {
95                writer.fa.write_record(&record).expect("");
96            }
97        }
98    }
99
100    fn process_gff(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
101        let mut reader = gff::Reader::from_file(path, GffType::GFF3).expect("");    //TODO: GFF TYPE
102        for record in reader.records() {
103            let record = record.expect("");
104            if let Some(id) = record.attributes().get("ID") {
105                if ids.contains(&Self::normalize_id(id)) {
106                    writer.gff.write(&record).expect("");
107                }
108            }
109        }
110    }
111
112    fn process_fastq(path: &PathBuf, ids: &HashSet<String>, writer: &mut MultiFormatWriter) {
113        let reader = fastq::Reader::from_file(path).expect("");
114        for record in reader.records() {
115            let record = record.expect("");
116            if ids.contains(&Self::normalize_id(record.id())) {
117                writer.fq.write_record(&record).expect("");
118            }
119        }
120    }
121}
122
123
124
125pub struct ExtractExplain;
126
127impl ExtractExplain {
128    pub fn extract(seq_files: Vec<PathBuf>, anno_files: Vec<PathBuf>, output: PathBuf) {
129        // 创建输出目录
130        fs::create_dir_all(&output).unwrap_or_else(|e| {
131            e_exit("FS", &format!("创建输出目录失败: {}", e), 1);
132        });
133
134        // 并行处理每个序列文件
135        seq_files.par_iter().for_each(|seq_path| {
136            // 加载序列数据
137            let seq_data = Self::load_sequences(seq_path)
138                .unwrap_or_else(|e| e_exit("SEQ-LOAD", &e, 2));
139
140            // 处理所有注释文件
141            let annotations: Vec<_> = anno_files.par_iter()
142                .flat_map(|anno_path| {
143                    Self::load_annotations(anno_path)
144                        .unwrap_or_else(|e| e_exit("ANN-LOAD", &e, 3))
145                })
146                .collect();
147
148            // 生成注释结果
149            let output_path = output.join(seq_path.file_name().unwrap());
150            Self::generate_annotated_file(&seq_data, &annotations, &output_path)
151                .unwrap_or_else(|e| e_exit("OUTPUT", &e, 4));
152        });
153    }
154
155    /// 加载序列数据到内存(适合中小型文件)
156    fn load_sequences(path: &Path) -> Result<HashMap<String, fasta::Record>, String> {
157        let reader = fasta::Reader::from_file(path)
158            .map_err(|e| format!("读取序列文件失败: {} - {}", path.display(), e))?;
159
160        let mut seq_map = HashMap::new();
161        for record in reader.records() {
162            let record = record.map_err(|e| format!("解析FASTA失败: {}", e))?;
163            seq_map.insert(record.id().to_string(), record);
164        }
165        Ok(seq_map)
166    }
167
168    /// 加载GFF注释信息
169    fn load_annotations(path: &Path) -> Result<Vec<gff::Record>, String> {
170        let mut reader = gff::Reader::from_file(path, GffType::GFF3)
171            .map_err(|e| format!("读取注释文件失败: {} - {}", path.display(), e))?;
172
173        reader.records()
174            .map(|r| r.map_err(|e| format!("解析GFF失败: {}", e)))
175            .collect()
176    }
177
178    /// 生成带注释的序列文件
179    fn generate_annotated_file(
180        seq_data: &HashMap<String, fasta::Record>,
181        annotations: &[gff::Record],
182        output: &Path
183    ) -> Result<(), String> {
184        let mut writer = fasta::Writer::new(File::create(output)
185            .map_err(|e| format!("创建输出文件失败: {} - {}", output.display(), e))?);
186
187        // 为每个注释生成特征序列
188        for ann in annotations {
189            let seq_id = ann.seqname();
190            let Some(seq) = seq_data.get(seq_id) else {
191                e_println("ANN-SKIP", &format!("未找到序列: {}", seq_id));
192                continue;
193            };
194
195            // 提取注释区间序列
196            let feature_seq = Self::extract_feature(seq, ann)
197                .map_err(|e| format!("提取特征失败: {}", e))?;
198
199            // 生成描述信息
200            let description = format!("{}:{}-{} {}",
201                                      ann.feature_type(),
202                                      ann.start(),
203                                      ann.end(),
204                                      ann.attributes().get("ID").unwrap_or(&"unknown".to_string())
205            );
206
207            // 写入新记录
208            let new_record = fasta::Record::with_attrs(
209                ann.attributes().get("ID").unwrap_or(&ann.seqname().to_string()),
210                Some(&description),
211                &feature_seq
212            );
213            writer.write_record(&new_record)
214                .map_err(|e| format!("写入失败: {}", e))?;
215        }
216        Ok(())
217    }
218
219    /// 从序列中提取特征区间
220    fn extract_feature(seq: &fasta::Record, ann: &gff::Record) -> Result<Vec<u8>, String> {
221        let start = ann.start().saturating_sub(1); // GFF是1-based
222        let &end = ann.end();
223
224        if start >= seq.seq().len() as u64 || end > seq.seq().len() as u64 {
225            return Err(format!("无效区间: {}-{} (序列长度: {})",
226                               ann.start(), ann.end(), seq.seq().len()));
227        }
228
229        Ok(seq.seq()[start as usize..end as usize].to_vec())
230    }
231}
232