tf_binding_rs/
fasta.rs

1use crate::error::MotifError;
2use polars::prelude::*;
3use std::collections::HashSet;
4use std::fs::File;
5use std::io::{BufRead, BufReader, Write};
6
7/// Reads sequences from a FASTA format file and converts them into a Polars DataFrame.
8///
9/// # Arguments
10/// * `filename` - Path to the FASTA file to read
11///
12/// # Returns
13/// * `Result<DataFrame>` - A DataFrame with two columns:
14///   - "label": The sequence identifiers (without '>' prefix)
15///   - "sequence": The corresponding DNA/RNA sequences in uppercase
16///
17/// # Errors
18/// * Returns `MotifError::InvalidFileFormat` if no sequences are found
19/// * Returns `MotifError::DataError` if DataFrame creation fails
20/// * Returns `std::io::Error` for file reading issues
21pub fn read_fasta(filename: &str) -> Result<DataFrame, MotifError> {
22    let mut sequences: Vec<(String, String)> = Vec::new();
23    let file = File::open(filename)?;
24    let reader = BufReader::new(file);
25
26    let mut current_header = String::new();
27    let mut current_sequence = String::new();
28
29    for line in reader.lines() {
30        let line = line?;
31        let line = line.trim();
32
33        if line.starts_with('>') {
34            if !current_header.is_empty() {
35                sequences.push((current_header, current_sequence.to_uppercase()));
36                current_sequence.clear();
37            }
38            current_header = line[1..].to_string();
39        } else if !line.is_empty() {
40            current_sequence.push_str(line);
41        }
42    }
43
44    if !current_header.is_empty() {
45        sequences.push((current_header, current_sequence.to_uppercase()));
46    }
47
48    if sequences.is_empty() {
49        return Err(MotifError::InvalidFileFormat(
50            "No sequences found".to_string(),
51        ));
52    }
53
54    let (labels, sequences): (Vec<String>, Vec<String>) = sequences.into_iter().unzip();
55    let df = DataFrame::new(vec![
56        Column::new("label".into(), labels),
57        Column::new("sequence".into(), sequences),
58    ])
59    .map_err(|e| MotifError::DataError(e.to_string()))?;
60
61    Ok(df)
62}
63
64/// Writes sequences from a Polars DataFrame to a FASTA format file.
65///
66/// # Arguments
67/// * `df` - DataFrame containing sequences with "label" and "sequence" columns
68/// * `filename` - Path where the FASTA file should be written
69///
70/// # Returns
71/// * `Result<()>` - Unit type if successful
72///
73/// # Errors
74/// * Returns `MotifError::DataError` if required columns are missing
75/// * Returns `MotifError::Io` for file writing issues
76pub fn write_fasta(df: &DataFrame, filename: &str) -> Result<(), MotifError> {
77    let labels = df
78        .column("label")
79        .map_err(|e| MotifError::DataError(e.to_string()))?
80        .str()
81        .unwrap();
82    let sequences = df
83        .column("sequence")
84        .map_err(|e| MotifError::DataError(e.to_string()))?
85        .str()
86        .unwrap();
87
88    let mut file = File::create(filename).map_err(MotifError::Io)?;
89
90    for idx in 0..df.height() {
91        let label = labels.get(idx).unwrap();
92        let sequence = sequences.get(idx).unwrap();
93
94        writeln!(file, ">{}", label).map_err(MotifError::Io)?;
95        writeln!(file, "{}", sequence).map_err(MotifError::Io)?;
96    }
97
98    Ok(())
99}
100
101/// Generates the reverse complement of a DNA sequence.
102///
103/// # Arguments
104/// * `sequence` - Input DNA sequence string
105///
106/// # Returns
107/// * `Result<String>` - The reverse complement sequence where:
108///   - A ↔ T
109///   - C ↔ G
110///
111/// # Errors
112/// * Returns `MotifError::InvalidInput` if sequence contains invalid nucleotides
113pub fn reverse_complement(sequence: &str) -> Result<String, MotifError> {
114    static COMPLEMENT: phf::Map<char, char> = phf::phf_map! {
115        'A' => 'T',
116        'T' => 'A',
117        'C' => 'G',
118        'G' => 'C',
119    };
120
121    sequence
122        .chars()
123        .rev()
124        .map(|c| {
125            COMPLEMENT
126                .get(&c)
127                .ok_or_else(|| MotifError::InvalidInput(format!("Invalid nucleotide: {}", c)))
128        })
129        .collect()
130}
131
132/// Calculates the GC content for each sequence in the input DataFrame.
133///
134/// # Arguments
135/// * `df` - DataFrame containing sequences with "label" and "sequence" columns
136///
137/// # Returns
138/// * `Result<DataFrame>` - A DataFrame with:
139///   - Original labels
140///   - "gc_content": Fraction of G and C bases in each sequence
141///
142/// # Errors
143/// * Returns `MotifError::DataError` if required columns are missing or DataFrame creation fails
144pub fn gc_content(df: &DataFrame) -> Result<DataFrame, MotifError> {
145    let sequences = df
146        .column("sequence")
147        .map_err(|e| MotifError::DataError(e.to_string()))?
148        .str()
149        .unwrap();
150
151    let gc_content: Vec<f64> = sequences
152        .into_iter()
153        .map(|seq| {
154            let seq = seq.unwrap();
155            let gc_count = seq.chars().filter(|&c| c == 'G' || c == 'C').count() as f64;
156            gc_count / seq.len() as f64
157        })
158        .collect();
159
160    let labels = df
161        .column("label")
162        .map_err(|e| MotifError::DataError(e.to_string()))?;
163
164    let new_df = DataFrame::new(vec![
165        labels.clone(),
166        Column::new("gc_content".into(), gc_content),
167    ])
168    .map_err(|e| MotifError::DataError(e.to_string()))?;
169
170    Ok(new_df)
171}
172
173/// Identifies sequences containing specified restriction sites.
174///
175/// # Arguments
176/// * `df` - DataFrame containing sequences with "label" and "sequence" columns
177/// * `restrictions` - Slice of restriction site patterns to search for
178///
179/// # Returns
180/// * `Result<DataFrame>` - A DataFrame with:
181///   - Original labels
182///   - "has_restriction_sites": Boolean indicating if any restriction site was found
183///
184/// # Errors
185/// * Returns `MotifError::DataError` if required columns are missing or DataFrame creation fails
186pub fn has_restriction_sites(
187    df: &DataFrame,
188    restrictions: &[&str],
189) -> Result<DataFrame, MotifError> {
190    let restrictions_set: HashSet<String> = restrictions.iter().map(|r| r.to_string()).collect();
191
192    let sequences = df
193        .column("sequence")
194        .map_err(|e| MotifError::DataError(e.to_string()))?
195        .str()
196        .unwrap();
197
198    let mask: Vec<bool> = sequences
199        .into_iter()
200        .map(|seq| {
201            let seq = seq.unwrap();
202            restrictions_set.iter().any(|r| seq.contains(r))
203        })
204        .collect();
205
206    let labels = df
207        .column("label")
208        .map_err(|e| MotifError::DataError(e.to_string()))?;
209
210    let new_df = DataFrame::new(vec![
211        labels.clone(),
212        Column::new("has_restriction_sites".into(), mask),
213    ])
214    .map_err(|e| MotifError::DataError(e.to_string()))?;
215
216    Ok(new_df)
217}