1use crate::error::MotifError;
2use polars::prelude::*;
3use std::collections::HashSet;
4use std::fs::File;
5use std::io::{BufRead, BufReader, Write};
6
7pub fn read_fasta(filename: &str) -> Result<DataFrame, MotifError> {
22 let mut sequences: Vec<(String, String)> = Vec::new();
23 let file = File::open(filename)?;
24 let reader = BufReader::new(file);
25
26 let mut current_header = String::new();
27 let mut current_sequence = String::new();
28
29 for line in reader.lines() {
30 let line = line?;
31 let line = line.trim();
32
33 if line.starts_with('>') {
34 if !current_header.is_empty() {
35 sequences.push((current_header, current_sequence.to_uppercase()));
36 current_sequence.clear();
37 }
38 current_header = line[1..].to_string();
39 } else if !line.is_empty() {
40 current_sequence.push_str(line);
41 }
42 }
43
44 if !current_header.is_empty() {
45 sequences.push((current_header, current_sequence.to_uppercase()));
46 }
47
48 if sequences.is_empty() {
49 return Err(MotifError::InvalidFileFormat(
50 "No sequences found".to_string(),
51 ));
52 }
53
54 let (labels, sequences): (Vec<String>, Vec<String>) = sequences.into_iter().unzip();
55 let df = DataFrame::new(vec![
56 Column::new("label".into(), labels),
57 Column::new("sequence".into(), sequences),
58 ])
59 .map_err(|e| MotifError::DataError(e.to_string()))?;
60
61 Ok(df)
62}
63
64pub fn write_fasta(df: &DataFrame, filename: &str) -> Result<(), MotifError> {
77 let labels = df
78 .column("label")
79 .map_err(|e| MotifError::DataError(e.to_string()))?
80 .str()
81 .unwrap();
82 let sequences = df
83 .column("sequence")
84 .map_err(|e| MotifError::DataError(e.to_string()))?
85 .str()
86 .unwrap();
87
88 let mut file = File::create(filename).map_err(MotifError::Io)?;
89
90 for idx in 0..df.height() {
91 let label = labels.get(idx).unwrap();
92 let sequence = sequences.get(idx).unwrap();
93
94 writeln!(file, ">{}", label).map_err(MotifError::Io)?;
95 writeln!(file, "{}", sequence).map_err(MotifError::Io)?;
96 }
97
98 Ok(())
99}
100
101pub fn reverse_complement(sequence: &str) -> Result<String, MotifError> {
114 static COMPLEMENT: phf::Map<char, char> = phf::phf_map! {
115 'A' => 'T',
116 'T' => 'A',
117 'C' => 'G',
118 'G' => 'C',
119 };
120
121 sequence
122 .chars()
123 .rev()
124 .map(|c| {
125 COMPLEMENT
126 .get(&c)
127 .ok_or_else(|| MotifError::InvalidInput(format!("Invalid nucleotide: {}", c)))
128 })
129 .collect()
130}
131
132pub fn gc_content(df: &DataFrame) -> Result<DataFrame, MotifError> {
145 let sequences = df
146 .column("sequence")
147 .map_err(|e| MotifError::DataError(e.to_string()))?
148 .str()
149 .unwrap();
150
151 let gc_content: Vec<f64> = sequences
152 .into_iter()
153 .map(|seq| {
154 let seq = seq.unwrap();
155 let gc_count = seq.chars().filter(|&c| c == 'G' || c == 'C').count() as f64;
156 gc_count / seq.len() as f64
157 })
158 .collect();
159
160 let labels = df
161 .column("label")
162 .map_err(|e| MotifError::DataError(e.to_string()))?;
163
164 let new_df = DataFrame::new(vec![
165 labels.clone(),
166 Column::new("gc_content".into(), gc_content),
167 ])
168 .map_err(|e| MotifError::DataError(e.to_string()))?;
169
170 Ok(new_df)
171}
172
173pub fn has_restriction_sites(
187 df: &DataFrame,
188 restrictions: &[&str],
189) -> Result<DataFrame, MotifError> {
190 let restrictions_set: HashSet<String> = restrictions.iter().map(|r| r.to_string()).collect();
191
192 let sequences = df
193 .column("sequence")
194 .map_err(|e| MotifError::DataError(e.to_string()))?
195 .str()
196 .unwrap();
197
198 let mask: Vec<bool> = sequences
199 .into_iter()
200 .map(|seq| {
201 let seq = seq.unwrap();
202 restrictions_set.iter().any(|r| seq.contains(r))
203 })
204 .collect();
205
206 let labels = df
207 .column("label")
208 .map_err(|e| MotifError::DataError(e.to_string()))?;
209
210 let new_df = DataFrame::new(vec![
211 labels.clone(),
212 Column::new("has_restriction_sites".into(), mask),
213 ])
214 .map_err(|e| MotifError::DataError(e.to_string()))?;
215
216 Ok(new_df)
217}