tf_binding_rs/
occupancy.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
use crate::error::MotifError;
use crate::types::*;
use polars::prelude::*;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::iter::Peekable;

/// Advances the iterator until a MOTIF line is found
fn skip_until_motif<I>(lines: &mut Peekable<I>)
where
    I: Iterator<Item = Result<String, std::io::Error>>,
{
    while let Some(Ok(line)) = lines.peek() {
        if line.starts_with("MOTIF") {
            break;
        }
        lines.next();
    }
}

/// Parses a single PWM from the iterator
fn parse_pwm<I>(lines: &mut I) -> Result<Option<(String, PWM)>, MotifError>
where
    I: Iterator<Item = Result<String, std::io::Error>>,
{
    // Get motif ID from MOTIF line
    let motif_line = match lines.next() {
        Some(Ok(line)) if line.starts_with("MOTIF") => line,
        _ => return Ok(None),
    };

    let motif_id = motif_line
        .split_whitespace()
        .nth(1)
        .ok_or_else(|| MotifError::InvalidFileFormat("Missing motif ID".into()))?
        .to_string();

    // Skip header lines
    for _ in 0..2 {
        lines.next();
    }

    // Read PWM rows until we hit a non-PWM line
    let pwm_rows: Vec<Vec<f64>> = lines
        .take_while(|line| {
            line.as_ref()
                .map(|l| l.starts_with(|c: char| c.is_whitespace() || c == '0' || c == '1'))
                .unwrap_or(false)
        })
        .map(|line| {
            let line = line.map_err(|e| MotifError::Io(e))?;
            let values: Vec<f64> = line
                .split_whitespace()
                .map(|s| s.parse::<f64>())
                .collect::<Result<Vec<_>, _>>()
                .map_err(|e| MotifError::InvalidFileFormat(format!("Invalid PWM value: {}", e)))?;

            Ok(values)
        })
        .collect::<Result<Vec<_>, MotifError>>()?;

    if pwm_rows.is_empty() {
        return Err(MotifError::InvalidFileFormat("Empty PWM".into()));
    }

    // Create PWM DataFrame
    let pwm = DataFrame::new(vec![
        Column::new(
            "A".into(),
            pwm_rows.iter().map(|row| row[0]).collect::<Vec<f64>>(),
        ),
        Column::new(
            "C".into(),
            pwm_rows.iter().map(|row| row[1]).collect::<Vec<f64>>(),
        ),
        Column::new(
            "G".into(),
            pwm_rows.iter().map(|row| row[2]).collect::<Vec<f64>>(),
        ),
        Column::new(
            "T".into(),
            pwm_rows.iter().map(|row| row[3]).collect::<Vec<f64>>(),
        ),
    ])
    .map_err(|e| MotifError::DataError(e.to_string()))?;

    Ok(Some((motif_id, pwm)))
}

/// Reads Position Weight Matrices (PWMs) from a MEME format file
///
/// This function parses a MEME-formatted file containing one or more Position Weight Matrices,
/// each identified by a unique motif ID. The PWMs represent DNA binding motifs where each position
/// contains probabilities for the four nucleotides (A, C, G, T).
///
/// # Arguments
/// * `filename` - Path to the MEME format file to read
///
/// # Returns
/// * `Result<PWMCollection, MotifError>` - A HashMap where keys are motif IDs and values are their corresponding PWMs
///
/// # Errors
/// * `MotifError::Io` - If the file cannot be opened or read
/// * `MotifError::InvalidFileFormat` - If the file format is invalid or no PWMs are found
/// * `MotifError::DataError` - If there are issues creating the PWM DataFrame
///
/// # Example
/// ```ignore
/// use tf_binding_rs::occupancy::read_pwm_files;
///
/// let pwms = read_pwm_files("path/to/motifs.meme").unwrap();
/// for (motif_id, pwm) in pwms {
///     println!("Found motif: {}", motif_id);
/// }
/// ```
///
/// # Format
/// The input file should be in MEME format, where each PWM is preceded by a "MOTIF" line
/// containing the motif ID, followed by the matrix values.
pub fn read_pwm_files(filename: &str) -> Result<PWMCollection, MotifError> {
    let file = File::open(filename)?;
    let reader = BufReader::new(file);
    let mut lines = reader.lines().peekable();
    let mut pwms = HashMap::new();

    // Skip header until first MOTIF
    skip_until_motif(&mut lines);

    // Parse all PWMs
    while let Some((id, pwm)) = parse_pwm(&mut lines)? {
        pwms.insert(id, pwm);
        skip_until_motif(&mut lines);
    }

    if pwms.is_empty() {
        return Err(MotifError::InvalidFileFormat("No PWMs found".into()));
    }

    Ok(pwms)
}