bed_utils/
bed.rs

1pub mod io;
2pub mod map;
3
4mod bed_trait;
5pub use bed_trait::*;
6mod score;
7use bincode::{Decode, Encode};
8pub use score::Score;
9mod strand;
10pub use strand::Strand;
11
12use std::{fmt::{self, Write}, ops::Deref, str::FromStr};
13
14#[cfg(feature = "serde")]
15use serde::{Deserialize, Serialize};
16
/// Column separator used when splitting and writing BED-like records.
const DELIMITER: char = '\t';
/// Placeholder text standing in for an absent optional column.
const MISSING_ITEM: &str = ".";
19
20/// A minimal BED record with only 3 fields.
21#[derive(Encode, Decode, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
22#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
23pub struct GenomicRange(String, u64, u64);
24
25impl GenomicRange {
26    pub fn new<C>(chrom: C, start: u64, end: u64) -> Self
27    where
28        C: Into<String>,
29    { Self(chrom.into(), start, end) }
30
31    /// Convert the record to a string representation: chr:start-end
32    pub fn pretty_show(&self) -> String {
33        format!("{}:{}-{}", self.0, self.1, self.2)
34    }
35}
36
37/// Convert string to GenomicRange. '\t', ':', and '-' are all considered as
38/// valid delimiters. So any of the following formats is valid:
39/// * chr1\t100\t200
40/// * chr1:100-200
41impl FromStr for GenomicRange {
42    type Err = ParseError;
43
44    fn from_str(s: &str) -> Result<Self, Self::Err> {
45        let mut fields = s.split(&['\t', ':', '-']);
46        let chrom = parse_chrom(&mut fields)?;
47        let start = parse_start(&mut fields)?;
48        let end = parse_end(&mut fields)?;
49        Ok(GenomicRange::new(chrom, start, end))
50    }
51}
52
53impl BEDLike for GenomicRange {
54    fn chrom(&self) -> &str { &self.0 }
55    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
56        self.0 = chrom.to_string();
57        self
58    }
59    fn start(&self) -> u64 { self.1 }
60    fn set_start(&mut self, start: u64) -> &mut Self {
61        self.1 = start;
62        self
63    }
64    fn end(&self) -> u64 { self.2 }
65    fn set_end(&mut self, end: u64) -> &mut Self {
66        self.2 = end;
67        self
68    }
69    fn name(&self) -> Option<&str> { None }
70    fn score(&self) -> Option<Score> { None }
71    fn strand(&self) -> Option<Strand> { None }
72}
73
74impl fmt::Display for GenomicRange {
75    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
76        write!(f, "{}{}{}{}{}", self.chrom(), DELIMITER, self.start(),
77            DELIMITER, self.end()
78        )?;
79        Ok(())
80    }
81}
82
83
84/// A standard BED record.
85#[derive(Encode, Decode, Clone, Debug, Eq, PartialEq)]
86#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
87pub struct BED<const N: u8> {
88    chrom: String,
89    start: u64,
90    end: u64,
91    pub name: Option<String>,
92    pub score: Option<Score>,
93    pub strand: Option<Strand>,
94    pub optional_fields: OptionalFields,
95}
96
97impl<const N: u8> BED<N> {
98    pub fn new<C>(chrom: C, start: u64, end: u64, name: Option<String>,
99        score: Option<Score>, strand: Option<Strand>, optional_fields: OptionalFields) -> Self
100    where
101        C: Into<String>,
102    { Self { chrom: chrom.into(), start, end, name, score, strand, optional_fields } }
103}
104
105impl<const N: u8> BEDLike for BED<N> {
106    fn chrom(&self) -> &str { &self.chrom }
107    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
108        self.chrom = chrom.to_string();
109        self
110    }
111    fn start(&self) -> u64 { self.start }
112    fn set_start(&mut self, start: u64) -> &mut Self {
113        self.start = start;
114        self
115    }
116    fn end(&self) -> u64 { self.end }
117    fn set_end(&mut self, end: u64) -> &mut Self {
118        self.end = end;
119        self
120    }
121    fn name(&self) -> Option<&str> { self.name.as_deref() }
122    fn score(&self) -> Option<Score> { self.score }
123    fn strand(&self) -> Option<Strand> { self.strand }
124}
125
126// Display trait
127impl<const N: u8> fmt::Display for BED<N> {
128    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
129        write!(
130            f,
131            "{}{}{}{}{}",
132            self.chrom(),
133            DELIMITER,
134            self.start(),
135            DELIMITER,
136            self.end()
137        )?;
138        if N > 3 {
139            write!(f, "{}{}", DELIMITER, self.name().unwrap_or(MISSING_ITEM))?;
140            if N > 4 {
141                f.write_char(DELIMITER)?;
142                if let Some(score) = self.score() {
143                    write!(f, "{}", score)?;
144                } else { f.write_str(MISSING_ITEM)?; }
145
146                if N > 5 {
147                    f.write_char(DELIMITER)?;
148                    if let Some(strand) = self.strand() {
149                        write!(f, "{}", strand)?;
150                    } else { f.write_str(MISSING_ITEM)?; }
151                }
152            }
153        }
154        Ok(())
155    }
156}
157
158impl<const N: u8> FromStr for BED<N> {
159    type Err = ParseError;
160
161    fn from_str(s: &str) -> Result<Self, Self::Err> {
162        let mut fields = s.split(DELIMITER);
163        let chrom = parse_chrom(&mut fields)?;
164        let start = parse_start(&mut fields)?;
165        let end = parse_end(&mut fields)?;
166        let name = if N > 3 { parse_name(&mut fields)? } else { None };
167        let score = if N > 4 { parse_score(&mut fields)? } else { None };
168        let strand = if N > 5 { parse_strand(&mut fields)? } else { None };
169        Ok(BED::new(chrom, start, end, name, score, strand, OptionalFields::default()))
170    }
171}
172
173/// Generic BED record optional fields.
174#[derive(Encode, Decode, Clone, Debug, Default, Eq, PartialEq)]
175#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
176pub struct OptionalFields(Vec<String>);
177
178impl Deref for OptionalFields {
179    type Target = [String];
180
181    fn deref(&self) -> &Self::Target {
182        &self.0
183    }
184}
185
186impl fmt::Display for OptionalFields {
187    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
188        for (i, field) in self.0.iter().enumerate() {
189            if i > 0 {
190                f.write_char(DELIMITER)?;
191            }
192
193            f.write_str(field)?;
194        }
195
196        Ok(())
197    }
198}
199
200impl From<Vec<String>> for OptionalFields {
201    fn from(fields: Vec<String>) -> Self {
202        Self(fields)
203    }
204}
205
206
207/// A NarrowPeak record is a BED6+4 format that is used to store called peaks.
208#[derive(Encode, Decode, Clone, Debug, PartialEq)]
209#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
210pub struct NarrowPeak {
211    pub chrom: String,
212    pub start: u64,
213    pub end: u64,
214    pub name: Option<String>,
215    pub score: Option<Score>,
216    pub strand: Option<Strand>,
217    pub signal_value: f64,
218    pub p_value: Option<f64>, 
219    pub q_value: Option<f64>, 
220    pub peak: u64, 
221}
222
223impl BEDLike for NarrowPeak {
224    fn chrom(&self) -> &str { &self.chrom }
225    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
226        self.chrom = chrom.to_string();
227        self
228    }
229    fn start(&self) -> u64 { self.start }
230    fn set_start(&mut self, start: u64) -> &mut Self {
231        self.start = start;
232        self
233    }
234    fn end(&self) -> u64 { self.end }
235    fn set_end(&mut self, end: u64) -> &mut Self {
236        self.end = end;
237        self
238    }
239    fn name(&self) -> Option<&str> { self.name.as_deref() }
240    fn score(&self) -> Option<Score> { self.score }
241    fn strand(&self) -> Option<Strand> { self.strand }
242}
243
244impl fmt::Display for NarrowPeak {
245    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
246        write!(
247            f,
248            "{}{}{}{}{}{}{}",
249            self.chrom(),
250            DELIMITER, self.start(),
251            DELIMITER, self.end(),
252            DELIMITER, self.name().unwrap_or(MISSING_ITEM),
253        )?;
254
255        f.write_char(DELIMITER)?;
256        if let Some(x) = self.score() {
257            write!(f, "{}", x)?;
258        } else {
259            f.write_str(MISSING_ITEM)?;
260        }
261        f.write_char(DELIMITER)?;
262        if let Some(x) = self.strand() {
263            write!(f, "{}", x)?;
264        } else {
265            f.write_str(MISSING_ITEM)?;
266        }
267        write!(
268            f,
269            "{}{}{}{}{}{}{}{}",
270            DELIMITER, self.signal_value,
271            DELIMITER, self.p_value.unwrap_or(-1.0),
272            DELIMITER, self.q_value.unwrap_or(-1.0),
273            DELIMITER, self.peak,
274        )?;
275
276        Ok(())
277    }
278}
279
280impl FromStr for NarrowPeak {
281    type Err = ParseError;
282
283    fn from_str(s: &str) -> Result<Self, Self::Err> {
284        let mut fields = s.split(DELIMITER);
285        Ok(Self {
286            chrom: parse_chrom(&mut fields)?.to_string(),
287            start: parse_start(&mut fields)?,
288            end: parse_end(&mut fields)?,
289            name: parse_name(&mut fields)?,
290            score: parse_score(&mut fields)?,
291            strand: parse_strand(&mut fields)?,
292            signal_value: fields.next().unwrap().parse().unwrap(),
293            p_value: parse_pvalue(&mut fields).unwrap(),
294            q_value: parse_pvalue(&mut fields).unwrap(),
295            peak: fields.next().unwrap().parse().unwrap(),
296        })
297    }
298}
299
300/// A BroadPeak record is a BED6+4 format that is used to store called peaks.
301#[derive(Encode, Decode, Clone, Debug, PartialEq)]
302#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
303pub struct BroadPeak {
304    pub chrom: String,
305    pub start: u64,
306    pub end: u64,
307    pub name: Option<String>,
308    pub score: Option<Score>,
309    pub strand: Option<Strand>,
310    pub signal_value: f64,
311    pub p_value: Option<f64>, 
312    pub q_value: Option<f64>, 
313}
314
315impl BEDLike for BroadPeak {
316    fn chrom(&self) -> &str { &self.chrom }
317    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
318        self.chrom = chrom.to_string();
319        self
320    }
321    fn start(&self) -> u64 { self.start }
322    fn set_start(&mut self, start: u64) -> &mut Self {
323        self.start = start;
324        self
325    }
326    fn end(&self) -> u64 { self.end }
327    fn set_end(&mut self, end: u64) -> &mut Self {
328        self.end = end;
329        self
330    }
331    fn name(&self) -> Option<&str> { self.name.as_deref() }
332    fn score(&self) -> Option<Score> { self.score }
333    fn strand(&self) -> Option<Strand> { self.strand }
334}
335
336impl fmt::Display for BroadPeak {
337    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
338        write!(
339            f,
340            "{}{}{}{}{}{}{}",
341            self.chrom(),
342            DELIMITER, self.start(),
343            DELIMITER, self.end(),
344            DELIMITER, self.name().unwrap_or(MISSING_ITEM),
345        )?;
346
347        f.write_char(DELIMITER)?;
348        if let Some(x) = self.score() {
349            write!(f, "{}", x)?;
350        } else {
351            f.write_str(MISSING_ITEM)?;
352        }
353        f.write_char(DELIMITER)?;
354        if let Some(x) = self.strand() {
355            write!(f, "{}", x)?;
356        } else {
357            f.write_str(MISSING_ITEM)?;
358        }
359        write!(
360            f,
361            "{}{}{}{}{}{}",
362            DELIMITER, self.signal_value,
363            DELIMITER, self.p_value.unwrap_or(-1.0),
364            DELIMITER, self.q_value.unwrap_or(-1.0),
365        )?;
366
367        Ok(())
368    }
369}
370
371impl FromStr for BroadPeak {
372    type Err = ParseError;
373
374    fn from_str(s: &str) -> Result<Self, Self::Err> {
375        let mut fields = s.split(DELIMITER);
376        Ok(Self {
377            chrom: parse_chrom(&mut fields)?.to_string(),
378            start: parse_start(&mut fields)?,
379            end: parse_end(&mut fields)?,
380            name: parse_name(&mut fields)?,
381            score: parse_score(&mut fields)?,
382            strand: parse_strand(&mut fields)?,
383            signal_value: fields.next().unwrap().parse().unwrap(),
384            p_value: parse_pvalue(&mut fields).unwrap(),
385            q_value: parse_pvalue(&mut fields).unwrap(),
386        })
387    }
388}
389
390/// The bedGraph format allows display of continuous-valued data in track format.
391/// This display type is useful for probability scores and transcriptome data. 
392#[derive(Encode, Decode, Clone, Debug, PartialEq)]
393#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
394pub struct BedGraph<V> {
395    pub chrom: String,
396    pub start: u64,
397    pub end: u64,
398    pub value: V,
399}
400
401impl<V> BedGraph<V> {
402    pub fn new<C>(chrom: C, start: u64, end: u64, value: V) -> Self
403    where
404        C: Into<String>,
405    { Self { chrom: chrom.into(), start, end, value } }
406
407    pub fn from_bed<B: BEDLike>(bed: &B, value: V) -> Self {
408        Self::new(bed.chrom(), bed.start(), bed.end(), value)
409    }
410}
411
412impl<V> BEDLike for BedGraph<V> {
413    fn chrom(&self) -> &str { &self.chrom }
414    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
415        self.chrom = chrom.to_string();
416        self
417    }
418    fn start(&self) -> u64 { self.start }
419    fn set_start(&mut self, start: u64) -> &mut Self {
420        self.start = start;
421        self
422    }
423    fn end(&self) -> u64 { self.end }
424    fn set_end(&mut self, end: u64) -> &mut Self {
425        self.end = end;
426        self
427    }
428    fn name(&self) -> Option<&str> { None }
429    fn score(&self) -> Option<Score> { None }
430    fn strand(&self) -> Option<Strand> { None }
431}
432
433impl<V> fmt::Display for BedGraph<V>
434where
435    V: fmt::Display,
436{
437    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
438    {
439        write!(
440            f,
441            "{}{}{}{}{}{}{}",
442            self.chrom(),
443            DELIMITER, self.start(),
444            DELIMITER, self.end(),
445            DELIMITER, self.value,
446        )
447    }
448}
449
450impl<V> FromStr for BedGraph<V>
451where
452    V: FromStr,
453    <V as FromStr>::Err: std::fmt::Debug,
454{
455    type Err = ParseError;
456
457    fn from_str(s: &str) -> Result<Self, Self::Err>
458    {
459        let mut fields = s.split(DELIMITER);
460        Ok(Self {
461            chrom: parse_chrom(&mut fields)?.to_string(),
462            start: parse_start(&mut fields)?,
463            end: parse_end(&mut fields)?,
464            value: fields.next().unwrap().parse().unwrap(),
465        })
466    }
467}
468
469fn parse_chrom<'a, I>(fields: &mut I) -> Result<&'a str, ParseError>
470where
471    I: Iterator<Item = &'a str>,
472{
473    fields
474        .next()
475        .ok_or(ParseError::MissingReferenceSequenceName)
476}
477
478fn parse_start<'a, I>(fields: &mut I) -> Result<u64, ParseError>
479where
480    I: Iterator<Item = &'a str>,
481{
482    fields
483        .next()
484        .ok_or(ParseError::MissingStartPosition)
485        .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidStartPosition))
486}
487
488fn parse_end<'a, I>(fields: &mut I) -> Result<u64, ParseError>
489where
490    I: Iterator<Item = &'a str>,
491{
492    fields
493        .next()
494        .ok_or(ParseError::MissingEndPosition)
495        .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidEndPosition))
496}
497
498fn parse_name<'a, I>(fields: &mut I) -> Result<Option<String>, ParseError>
499where
500    I: Iterator<Item = &'a str>,
501{
502    fields
503        .next()
504        .ok_or(ParseError::MissingName)
505        .map(|s| match s {
506            MISSING_ITEM => None,
507            _ => Some(s.into()),
508        })
509}
510
511fn parse_score<'a, I>(fields: &mut I) -> Result<Option<Score>, ParseError>
512where
513    I: Iterator<Item = &'a str>,
514{
515    fields
516        .next()
517        .ok_or(ParseError::MissingScore)
518        .and_then(|s| match s {
519            MISSING_ITEM => Ok(None),
520            _ => s.parse().map(Some).map_err(ParseError::InvalidScore),
521        })
522}
523
524fn parse_strand<'a, I>(fields: &mut I) -> Result<Option<Strand>, ParseError>
525where
526    I: Iterator<Item = &'a str>,
527{
528    fields
529        .next()
530        .ok_or(ParseError::MissingStrand)
531        .and_then(|s| match s {
532            MISSING_ITEM => Ok(None),
533            _ => s.parse().map(Some).map_err(ParseError::InvalidStrand),
534        })
535}
536
537fn parse_pvalue<'a, I>(fields: &mut I) -> Result<Option<f64>, ParseError>
538where
539    I: Iterator<Item = &'a str>,
540{
541    fields
542        .next()
543        .ok_or(ParseError::MissingScore)
544        .and_then(|s| {
545            let p = s.parse().unwrap();
546            if p < 0.0 { Ok(None) } else { Ok(Some(p)) }
547        })
548}
549
550/// An error returned when a raw BED record fails to parse.
551#[derive(Clone, Debug, Eq, PartialEq)]
552pub enum ParseError {
553    /// The reference sequence name is missing.
554    MissingReferenceSequenceName,
555    /// The start position is missing.
556    MissingStartPosition,
557    /// The start position is invalid.
558    InvalidStartPosition(lexical::Error),
559    /// The end position is missing.
560    MissingEndPosition,
561    /// The end position is invalid.
562    InvalidEndPosition(lexical::Error),
563    /// The name is missing.
564    MissingName,
565    /// The score is missing.
566    MissingScore,
567    /// The score is invalid.
568    InvalidScore(score::ParseError),
569    /// The strand is missing.
570    MissingStrand,
571    /// The strand is invalid.
572    InvalidStrand(strand::ParseError),
573}
574
#[cfg(test)]
mod bed_tests {
    use super::*;

    /// Checks the tab-joined rendering of `OptionalFields` and that every
    /// accepted spelling of a range parses to the same `GenomicRange`.
    #[test]
    fn test_fmt() {
        let render = |items: &[&str]| {
            let owned: Vec<String> = items.iter().map(|item| item.to_string()).collect();
            OptionalFields::from(owned).to_string()
        };
        assert_eq!(OptionalFields::default().to_string(), "");
        assert_eq!(render(&["n"]), "n");
        assert_eq!(render(&["n", "d"]), "n\td");

        let expected = GenomicRange::new("chr1", 100, 200);
        let spellings = [
            String::from("chr1\t100\t200"),
            String::from("chr1-100-200"),
            String::from("chr1:100-200"),
            String::from("chr1:100:200"),
            expected.pretty_show(),
        ];
        for text in spellings.iter() {
            assert_eq!(expected, GenomicRange::from_str(text).unwrap());
        }
    }
}