nanalogue_core/utils/
mod_char.rs

1//! `ModChar` struct for handling DNA modification tags
2//! Handles both letter and numeric modification codes from BAM files
3
4use crate::Error;
5use serde::{Deserialize, Serialize};
6use std::fmt;
7use std::str::FromStr;
8
/// Our struct to hold a modification tag.
///
/// The BAM file format uses the syntax `base+mod_code` in its MM tag
/// to show which modification is represented e.g. C+m, A+a, T+T, ...
/// (the companion ML tag only carries the per-base probabilities).
/// The code can be a letter or a number e.g. T+472232 represents `BrdU`,
/// as that is its `ChEBI` code.
///
/// As we rely on a fibertools-rs data structure to store mod information (`BaseMod`),
/// which uses a char datatype to represent the mod code, representing a one letter
/// mod code is easy, but numbers need to be converted to char first.
/// Fortunately, rust's char datatype is almost equivalent to u32, so we just
/// convert numbers to char before storing.
///
/// NOTE: the above conversion has some known problems:
/// * a numeric code that coincides with the code point of an ASCII letter is
///   indistinguishable from that letter, e.g. A+a is equivalent to A+97
///   (as 97 is the ascii code of a);
/// * a rust char must be a Unicode scalar value, so numbers in the surrogate
///   range 55296..=57343 (0xD800..=0xDFFF) and numbers above 1114111 (0x10FFFF)
///   cannot be represented at all.
///
/// We have chosen to live with this problem. I think the probability of
/// having a DNA modification with a `ChEBI` code overlapping with ASCII letters or
/// falling within the narrow surrogate range is very small.
#[derive(Debug, Clone, Copy, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct ModChar(char);
26
27/// Defaults to mod tag 'N'
28impl Default for ModChar {
29    fn default() -> Self {
30        ModChar::new('N')
31    }
32}
33
34impl ModChar {
35    /// We initialize with a character
36    #[must_use]
37    pub fn new(val: char) -> Self {
38        ModChar(val)
39    }
40    /// Returns the character
41    ///
42    /// ```
43    /// use nanalogue_core::ModChar;
44    /// for y in vec!['a','b','c','\u{D000}']{
45    ///     let x = ModChar::new(y.clone());
46    ///     assert_eq!(x.val(), y);
47    /// }
48    /// ```
49    #[must_use]
50    pub fn val(&self) -> char {
51        self.0
52    }
53}
54
55impl From<char> for ModChar {
56    fn from(value: char) -> Self {
57        ModChar::new(value)
58    }
59}
60
61impl From<u8> for ModChar {
62    fn from(value: u8) -> Self {
63        ModChar::new(char::from(value))
64    }
65}
66
67impl FromStr for ModChar {
68    type Err = Error;
69
70    /// process the modification type from a string,
71    /// returning the first character if it is a letter,
72    /// or converting it to a character if the first character is a number
73    ///
74    /// ```
75    /// use nanalogue_core::ModChar;
76    /// use std::str::FromStr;
77    ///
78    /// // Single letter modification codes
79    /// let mod_char = ModChar::from_str("m")?;
80    /// assert_eq!(mod_char.val(), 'm');
81    /// # Ok::<(), nanalogue_core::Error>(())
82    /// ```
83    ///
84    /// ```
85    /// # use nanalogue_core::ModChar;
86    /// # use std::str::FromStr;
87    /// #
88    /// // CheBI code for BrdU (5-bromo-2'-deoxyuridine)
89    /// let mod_char = ModChar::from_str("472232")?;
90    /// # Ok::<(), nanalogue_core::Error>(())
91    /// ```
92    ///
93    /// ```
94    /// # use nanalogue_core::ModChar;
95    /// # use std::str::FromStr;
96    /// #
97    /// // Small numeric codes
98    /// let mod_char = ModChar::from_str("123")?;
99    /// # Ok::<(), nanalogue_core::Error>(())
100    /// ```
101    ///
102    /// ```should_panic
103    /// # use nanalogue_core::ModChar;
104    /// # use std::str::FromStr;
105    /// #
106    /// // Invalid: starts with special character
107    /// let mod_char = ModChar::from_str("@123")?;
108    /// # Ok::<(), nanalogue_core::Error>(())
109    /// ```
110    fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
111        let first_char = mod_type
112            .chars()
113            .next()
114            .ok_or(Error::EmptyModType(String::new()))?;
115        match first_char {
116            'A'..='Z' | 'a'..='z' if mod_type.len() == 1 => Ok(ModChar(first_char)),
117            '0'..='9' => {
118                let val = char::from_u32(mod_type.parse()?)
119                    .ok_or(Error::InvalidModType(mod_type.to_owned()))?;
120                Ok(ModChar(val))
121            }
122            _ => Err(Error::InvalidModType(mod_type.to_owned())),
123        }
124    }
125}
126
127impl fmt::Display for ModChar {
128    /// converts to string for display. If the value is in the alphabet,
129    /// display it. Otherwise, display the equivalent number.
130    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
131        match self.val() {
132            w @ ('A'..='Z' | 'a'..='z') => w.to_string(),
133            w => (w as u32).to_string(),
134        }
135        .fmt(f)
136    }
137}
138
139impl Serialize for ModChar {
140    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
141    where
142        S: serde::Serializer,
143    {
144        // Use the Display implementation to serialize as a string
145        serializer.serialize_str(&self.to_string())
146    }
147}
148
149impl<'de> Deserialize<'de> for ModChar {
150    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
151    where
152        D: serde::Deserializer<'de>,
153    {
154        let s = String::deserialize(deserializer)?;
155        ModChar::from_str(&s).map_err(serde::de::Error::custom)
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::*;

    /// Tests if `ModChar` is displayed correctly
    #[test]
    fn display_mod_char() {
        for text in ["a", "T", "77000"] {
            let parsed = ModChar::from_str(text).expect("no failure");
            assert_eq!(parsed.to_string(), text);
        }
    }

    /// Tests `ModChar` numeric conversion and edge cases
    #[test]
    fn modchar_numeric_conversion() {
        // Each row: input, expected stored char (when checked), expected display.
        let cases: [(&str, Option<char>, &str); 6] = [
            // Letter codes
            ("m", Some('m'), "m"),
            ("T", Some('T'), "T"),
            // Small numeric code
            ("123", None, "123"),
            // CheBI code for BrdU
            ("472232", None, "472232"),
            // ASCII boundary - 97 is 'a'; a char in the alphabet range is
            // displayed as the letter, not the number
            ("97", Some('a'), "a"),
            // Very large number that is valid unicode
            ("65536", None, "65536"),
        ];
        for (input, expected_val, expected_display) in cases {
            let parsed: ModChar = input.parse().expect("should parse");
            if let Some(expected) = expected_val {
                assert_eq!(parsed.val(), expected);
            }
            assert_eq!(parsed.to_string(), expected_display);
        }
    }

    #[test]
    #[should_panic(expected = "EmptyModType")]
    fn modchar_empty_string_panics() {
        let _: ModChar = "".parse().unwrap();
    }

    #[test]
    #[should_panic(expected = "InvalidModType")]
    fn modchar_special_char_at_panics() {
        let _: ModChar = "@123".parse().unwrap();
    }

    #[test]
    #[should_panic(expected = "InvalidModType")]
    fn modchar_special_char_hash_panics() {
        let _: ModChar = "#abc".parse().unwrap();
    }

    /// Tests `ModChar` display format consistency
    #[test]
    fn modchar_display_consistency() {
        // Letters should display as letters
        for letter in "abzABZ".chars() {
            assert_eq!(ModChar::new(letter).to_string(), letter.to_string());
        }

        // Numbers converted to char should display as their numeric value
        for num in [123, 456, 789, 472_232] {
            let text = num.to_string();
            let parsed = ModChar::from_str(&text).expect("should parse");
            assert_eq!(parsed.to_string(), text);
        }
    }

    /// Tests `From<char>` implementation for `ModChar`
    #[test]
    fn from_char() {
        let samples = [
            // lowercase letters
            'a',
            'z',
            // uppercase letters
            'A',
            'Z',
            // numeric characters
            '0',
            '9',
            // special characters
            '@',
            '#',
            // unicode characters
            '\u{D000}',
            '\u{1F600}', // emoji
        ];
        for sample in samples {
            assert_eq!(ModChar::from(sample).val(), sample);
        }
    }

    /// Tests `From<u8>` implementation for `ModChar`
    #[test]
    fn from_u8() {
        // u8 conversion must be equivalent to char::from for all possible u8 values
        for byte_val in u8::MIN..=u8::MAX {
            assert_eq!(ModChar::from(byte_val).val(), char::from(byte_val));
        }
    }
}